1use anyhow::{Context, Result};
7use serde::{Deserialize, Serialize};
8use std::env;
9
10#[derive(Debug, Clone, Serialize, Deserialize)]
15pub struct Config {
16 pub environment: String,
18 pub azure: AzureConfig,
20 pub application: ApplicationConfig,
22}
23
24#[derive(Debug, Clone, Serialize, Deserialize)]
28pub struct AzureConfig {
29 pub search_service_name: String,
31 pub search_api_key: String,
33 pub search_api_version: String,
35 pub search_index_name: String,
37 pub cosmos_endpoint: String,
39 pub cosmos_key: String,
41 pub cosmos_database_name: String,
43 pub cosmos_container_name: String,
45}
46
47#[derive(Debug, Clone, Serialize, Deserialize)]
51pub struct ApplicationConfig {
52 pub max_crawl_depth: usize,
54 pub crawl_delay_ms: u64,
56 pub max_concurrent_requests: usize,
58 pub user_agent: String,
60 pub allowed_domains: Vec<String>,
62 pub periodic_index_interval_days: u64,
64 pub duplicate_removal_interval_hours: u64,
66 pub admin_api_key: String,
68}
69
70impl Config {
71 fn default_allowed_domains() -> Vec<String> {
76 vec![
77 "api.drupal.org",
78 "api.haxe.org",
79 "api.qunitjs.com",
80 "babeljs.io",
81 "backbonejs.org",
82 "bazel.build",
83 "bluebirdjs.com",
84 "bower.io",
85 "cfdocs.org",
86 "clojure.org",
87 "clojuredocs.org",
88 "codecept.io",
89 "codeception.com",
90 "codeigniter.com",
91 "coffeescript.org",
92 "cran.r-project.org",
93 "crystal-lang.org",
94 "forum.crystal-lang.org",
95 "css-tricks.com",
96 "dart.dev",
97 "dev.mysql.com",
98 "developer.apple.com",
99 "developer.mozilla.org",
100 "developer.wordpress.org",
101 "doc.deno.land",
102 "doc.rust-lang.org",
103 "docs.astro.build",
104 "docs.aws.amazon.com",
105 "docs.brew.sh",
106 "docs.chef.io",
107 "docs.cypress.io",
108 "docs.influxdata.com",
109 "docs.julialang.org",
110 "docs.microsoft.com",
111 "docs.npmjs.com",
112 "docs.oracle.com",
113 "docs.phalconphp.com",
114 "docs.python.org",
115 "docs.rs",
116 "docs.ruby-lang.org",
117 "docs.saltproject.io",
118 "docs.wagtail.org",
119 "doctrine-project.org",
120 "docwiki.embarcadero.com",
121 "eigen.tuxfamily.org",
122 "elixir-lang.org",
123 "elm-lang.org",
124 "en.cppreference.com",
125 "enzymejs.github.io",
126 "erights.org",
127 "erlang.org",
128 "esbuild.github.io",
129 "eslint.org",
130 "expressjs.com",
131 "fastapi.tiangolo.com",
132 "flow.org",
133 "fortran90.org",
134 "fsharp.org",
135 "getbootstrap.com",
136 "getcomposer.org",
137 "git-scm.com",
138 "gnu.org",
139 "gnucobol.sourceforge.io",
140 "go.dev",
141 "golang.org",
142 "graphite.readthedocs.io",
143 "groovy-lang.org",
144 "gruntjs.com",
145 "handlebarsjs.com",
146 "haskell.org",
147 "hex.pm",
148 "hexdocs.pm",
149 "httpd.apache.org",
150 "i3wm.org",
151 "jasmine.github.io",
152 "javascript.info",
153 "jekyllrb.com",
154 "jsdoc.app",
155 "julialang.org",
156 "knockoutjs.com",
157 "kotlinlang.org",
158 "laravel.com",
159 "latexref.xyz",
160 "learn.microsoft.com",
161 "lesscss.org",
162 "love2d.org",
163 "lua.org",
164 "man7.org",
165 "mariadb.com",
166 "mochajs.org",
167 "modernizr.com",
168 "momentjs.com",
169 "mongoosejs.com",
170 "next.router.vuejs.org",
171 "next.vuex.vuejs.org",
172 "nginx.org",
173 "nim-lang.org",
174 "nixos.org",
175 "nodejs.org",
176 "npmjs.com",
177 "ocaml.org",
178 "odin-lang.org",
179 "openjdk.java.net",
180 "opentsdb.net",
181 "perldoc.perl.org",
182 "php.net",
183 "playwright.dev",
184 "pointclouds.org",
185 "postgresql.org",
186 "prettier.io",
187 "pugjs.org",
188 "pydata.org",
189 "pytorch.org",
190 "qt.io",
191 "r-project.org",
192 "react-bootstrap.github.io",
193 "reactivex.io",
194 "reactjs.org",
195 "reactnative.dev",
196 "reactrouterdotcom.fly.dev",
197 "readthedocs.io",
198 "readthedocs.org",
199 "redis.io",
200 "redux.js.org",
201 "requirejs.org",
202 "rethinkdb.com",
203 "ruby-doc.org",
204 "ruby-lang.org",
205 "rust-lang.org",
206 "rxjs.dev",
207 "sass-lang.com",
208 "scala-lang.org",
209 "scikit-image.org",
210 "scikit-learn.org",
211 "spring.io",
212 "sqlite.org",
213 "stdlib.ponylang.io",
214 "superuser.com",
215 "svelte.dev",
216 "swift.org",
217 "tailwindcss.com",
218 "twig.symfony.com",
219 "typescriptlang.org",
220 "underscorejs.org",
221 "vitejs.dev",
222 "vitest.dev",
223 "vuejs.org",
224 "vueuse.org",
225 "webpack.js.org",
226 "wiki.archlinux.org",
227 "www.chaijs.com",
228 "www.electronjs.org",
229 "www.gnu.org",
230 "www.hammerspoon.org",
231 "www.khronos.org",
232 "www.lua.org",
233 "www.php.net",
234 "www.pygame.org",
235 "www.rubydoc.info",
236 "www.statsmodels.org",
237 "www.tcl.tk",
238 "www.terraform.io",
239 "www.vagrantup.com",
240 "www.yiiframework.com",
241 "yarnpkg.com",
242 ]
243 .into_iter()
244 .map(|s| s.to_string())
245 .collect()
246 }
247
248 pub fn from_env() -> Result<Self> {
283 let environment = env::var("ENVIRONMENT").unwrap_or_else(|_| "development".to_string());
284
285 let azure = AzureConfig {
286 search_service_name: env::var("AZURE_SEARCH_SERVICE_NAME")
287 .context("AZURE_SEARCH_SERVICE_NAME environment variable is required")?,
288 search_api_key: env::var("AZURE_SEARCH_API_KEY")
289 .context("AZURE_SEARCH_API_KEY environment variable is required")?,
290 search_api_version: env::var("AZURE_SEARCH_API_VERSION")
291 .unwrap_or_else(|_| "2023-11-01".to_string()),
292 search_index_name: env::var("AZURE_SEARCH_INDEX_NAME")
293 .unwrap_or_else(|_| "web-pages".to_string()),
294 cosmos_endpoint: env::var("AZURE_COSMOS_ENDPOINT")
295 .context("AZURE_COSMOS_ENDPOINT environment variable is required")?,
296 cosmos_key: env::var("AZURE_COSMOS_KEY")
297 .context("AZURE_COSMOS_KEY environment variable is required")?,
298 cosmos_database_name: env::var("AZURE_COSMOS_DATABASE_NAME")
299 .unwrap_or_else(|_| "search-engine".to_string()),
300 cosmos_container_name: env::var("AZURE_COSMOS_CONTAINER_NAME")
301 .unwrap_or_else(|_| "web-pages".to_string()),
302 };
303
304 let application = ApplicationConfig {
305 max_crawl_depth: env::var("MAX_CRAWL_DEPTH")
306 .unwrap_or_else(|_| "5".to_string())
307 .parse()
308 .context("MAX_CRAWL_DEPTH must be a valid number")?,
309 crawl_delay_ms: env::var("CRAWL_DELAY_MS")
310 .unwrap_or_else(|_| "1000".to_string())
311 .parse()
312 .context("CRAWL_DELAY_MS must be a valid number")?,
313 max_concurrent_requests: env::var("MAX_CONCURRENT_REQUESTS")
314 .unwrap_or_else(|_| "10".to_string())
315 .parse()
316 .context("MAX_CONCURRENT_REQUESTS must be a valid number")?,
317 user_agent: env::var("USER_AGENT")
318 .unwrap_or_else(|_| "SearchEngineBackend/0.1.0".to_string()),
319 allowed_domains: env::var("ALLOWED_DOMAINS")
320 .map(|domains| domains.split(',').map(|s| s.trim().to_string()).collect())
321 .unwrap_or_else(|_| Self::default_allowed_domains()),
322 periodic_index_interval_days: env::var("PERIODIC_INDEX_INTERVAL_DAYS")
323 .unwrap_or_else(|_| "7".to_string())
324 .parse()
325 .context("PERIODIC_INDEX_INTERVAL_DAYS must be a valid number")?,
326 duplicate_removal_interval_hours: env::var("DUPLICATE_REMOVAL_INTERVAL_HOURS")
327 .unwrap_or_else(|_| "24".to_string())
328 .parse()
329 .context("DUPLICATE_REMOVAL_INTERVAL_HOURS must be a valid number")?,
330 admin_api_key: env::var("ADMIN_API_KEY")
331 .unwrap_or_else(|_| "admin-key-change-me".to_string()),
332 };
333
334 Ok(Config {
335 environment,
336 azure,
337 application,
338 })
339 }
340
341 pub fn is_production(&self) -> bool {
346 self.environment == "production"
347 }
348
349 pub fn is_development(&self) -> bool {
354 self.environment == "development"
355 }
356
357 pub fn search_service_url(&self) -> String {
362 format!(
363 "https://{}.search.windows.net",
364 self.azure.search_service_name
365 )
366 }
367
368 pub fn search_index_url(&self) -> String {
373 format!(
374 "{}/indexes/{}",
375 self.search_service_url(),
376 self.azure.search_index_name
377 )
378 }
379
380 pub fn search_documents_url(&self) -> String {
385 format!("{}/docs", self.search_index_url())
386 }
387
388 pub fn search_query_url(&self) -> String {
393 self.search_documents_url()
394 }
395
396 pub fn is_domain_allowed(&self, domain: &str) -> bool {
404 self.application
405 .allowed_domains
406 .contains(&domain.to_string())
407 }
408}
409
#[cfg(test)]
mod tests {
    use super::*;
    use std::env;
    use std::ffi::OsString;
    use std::sync::{Mutex, MutexGuard};

    /// Serializes every test that touches the process environment. The test
    /// harness runs tests on several threads, and the environment is shared
    /// process-wide state.
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    /// Sets a group of environment variables for the lifetime of the guard and
    /// puts the previous values back on drop, including when a test panics.
    struct EnvGuard {
        saved: Vec<(String, Option<OsString>)>,
        // Declared last so the lock is released only after the environment
        // has been restored.
        _lock: MutexGuard<'static, ()>,
    }

    impl EnvGuard {
        fn apply(vars: &[(&str, &str)]) -> Self {
            // A panic in another test poisons the lock; the guard of that test
            // has already restored the environment, so continuing is safe.
            let lock = ENV_LOCK
                .lock()
                .unwrap_or_else(|poisoned| poisoned.into_inner());

            let mut saved = Vec::with_capacity(vars.len());
            for &(name, value) in vars {
                saved.push((name.to_string(), env::var_os(name)));
                env::set_var(name, value);
            }

            EnvGuard { saved, _lock: lock }
        }
    }

    impl Drop for EnvGuard {
        fn drop(&mut self) {
            // Reverse order so a variable set twice ends up with its original value.
            for (name, previous) in self.saved.iter().rev() {
                match previous {
                    Some(value) => env::set_var(name, value),
                    None => env::remove_var(name),
                }
            }
        }
    }

    /// Runs `Config::from_env` with the required variables set to dummy values
    /// plus the given overrides (later entries win).
    fn from_env_with(overrides: &[(&str, &str)]) -> anyhow::Result<Config> {
        let mut vars = vec![
            ("ENVIRONMENT", "test"),
            ("AZURE_SEARCH_SERVICE_NAME", "test"),
            ("AZURE_SEARCH_API_KEY", "test"),
            ("AZURE_COSMOS_ENDPOINT", "test"),
            ("AZURE_COSMOS_KEY", "test"),
            ("ADMIN_API_KEY", "test-admin-key"),
        ];
        vars.extend_from_slice(overrides);

        let _guard = EnvGuard::apply(&vars);
        Config::from_env()
    }

    fn create_test_config() -> Config {
        Config {
            environment: "test".to_string(),
            azure: AzureConfig {
                search_service_name: "test".to_string(),
                search_api_key: "test".to_string(),
                search_api_version: "2023-11-01".to_string(),
                search_index_name: "test".to_string(),
                cosmos_endpoint: "test".to_string(),
                cosmos_key: "test".to_string(),
                cosmos_database_name: "test".to_string(),
                cosmos_container_name: "test".to_string(),
            },
            application: ApplicationConfig {
                max_crawl_depth: 5,
                crawl_delay_ms: 1000,
                max_concurrent_requests: 10,
                user_agent: "test".to_string(),
                allowed_domains: Config::default_allowed_domains(),
                periodic_index_interval_days: 7,
                duplicate_removal_interval_hours: 24,
                admin_api_key: "test-admin-key".to_string(),
            },
        }
    }

    fn create_test_config_with_interval(interval_days: u64) -> Config {
        let mut config = create_test_config();
        config.application.periodic_index_interval_days = interval_days;
        config
    }

    #[test]
    fn test_default_allowed_domains() {
        let domains = Config::default_allowed_domains();

        assert!(!domains.is_empty());

        assert!(domains.contains(&"rust-lang.org".to_string()));
        assert!(domains.contains(&"docs.python.org".to_string()));
        assert!(domains.contains(&"developer.mozilla.org".to_string()));
        assert!(domains.contains(&"golang.org".to_string()));

        assert!(!domains.contains(&"example.com".to_string()));
    }

    #[test]
    fn test_is_domain_allowed() {
        let config = create_test_config();

        assert!(config.is_domain_allowed("rust-lang.org"));
        assert!(config.is_domain_allowed("docs.python.org"));
        assert!(config.is_domain_allowed("developer.mozilla.org"));

        assert!(!config.is_domain_allowed("example.com"));
        assert!(!config.is_domain_allowed("malicious-site.com"));
    }

    #[test]
    fn test_custom_allowed_domains_from_env() {
        // Whitespace around entries must be ignored.
        let config = from_env_with(&[("ALLOWED_DOMAINS", "example.com, test.org ,custom.net")])
            .expect("from_env should succeed when all required variables are set");

        assert!(config.is_domain_allowed("example.com"));
        assert!(config.is_domain_allowed("test.org"));
        assert!(config.is_domain_allowed("custom.net"));

        // A custom list replaces the defaults instead of extending them.
        assert!(!config.is_domain_allowed("rust-lang.org"));
    }

    #[test]
    fn test_periodic_index_interval_configuration() {
        let config = create_test_config();
        assert_eq!(config.application.periodic_index_interval_days, 7);

        let config = create_test_config_with_interval(14);
        assert_eq!(config.application.periodic_index_interval_days, 14);

        let config = from_env_with(&[("PERIODIC_INDEX_INTERVAL_DAYS", "21")])
            .expect("from_env should succeed when all required variables are set");
        assert_eq!(config.application.periodic_index_interval_days, 21);
    }

    #[test]
    fn test_duplicate_removal_interval_configuration() {
        let config = create_test_config();
        assert_eq!(config.application.duplicate_removal_interval_hours, 24);

        let config = from_env_with(&[("DUPLICATE_REMOVAL_INTERVAL_HOURS", "12")])
            .expect("from_env should succeed when all required variables are set");
        assert_eq!(config.application.duplicate_removal_interval_hours, 12);
    }

    #[test]
    fn test_invalid_numeric_value_is_rejected() {
        let result = from_env_with(&[("MAX_CRAWL_DEPTH", "not-a-number")]);
        assert!(result.is_err());
    }
}