1use anyhow::{Context, Result};
31use chrono::{DateTime, Utc};
32use reqwest::Client;
33use serde::{Deserialize, Serialize};
34use std::collections::HashMap;
35use std::sync::Arc;
36use tracing::{debug, error, info};
37
38use crate::{Config, SearchResult};
39
40#[derive(Debug, Clone, Serialize, Deserialize)]
44pub struct SearchDocument {
45 pub id: String,
47 pub title: String,
49 pub url: String,
51 pub content: String,
53 pub snippet: String,
55 pub domain: String,
57 pub indexed_at: DateTime<Utc>,
59 pub last_crawled: DateTime<Utc>,
61}
62
63#[derive(Debug, Serialize, Deserialize)]
67pub struct SearchRequest {
68 pub search: String,
70 #[serde(rename = "$top")]
72 pub top: usize,
73 #[serde(rename = "$skip")]
75 pub skip: usize,
76 #[serde(rename = "$select")]
78 pub select: Option<String>,
79 pub highlight: Option<String>,
80 #[serde(rename = "highlightPreTag")]
81 pub highlight_pre_tag: Option<String>,
82 #[serde(rename = "highlightPostTag")]
83 pub highlight_post_tag: Option<String>,
84}
85
86#[derive(Debug, Deserialize)]
87pub struct SearchResponse {
88 pub value: Vec<SearchHit>,
89 #[serde(rename = "@odata.count")]
90 #[allow(dead_code)]
91 pub count: Option<usize>,
92}
93
94#[derive(Debug, Deserialize)]
95pub struct SearchHit {
96 pub id: String,
97 pub title: String,
98 pub url: String,
99 #[allow(dead_code)]
100 pub content: Option<String>,
101 pub snippet: String,
102 #[allow(dead_code)]
103 pub domain: String,
104 pub indexed_at: DateTime<Utc>,
105 #[serde(rename = "@search.score")]
106 pub score: f64,
107 #[serde(rename = "@search.highlights")]
108 #[allow(dead_code)]
109 pub highlights: Option<HashMap<String, Vec<String>>>,
110}
111
112#[derive(Debug, Serialize)]
113pub struct IndexSchema {
114 pub name: String,
115 pub fields: Vec<IndexField>,
116}
117
118#[derive(Debug, Serialize)]
119pub struct IndexField {
120 pub name: String,
121 #[serde(rename = "type")]
122 pub field_type: String,
123 pub searchable: Option<bool>,
124 pub filterable: Option<bool>,
125 pub sortable: Option<bool>,
126 pub facetable: Option<bool>,
127 pub key: Option<bool>,
128 pub retrievable: Option<bool>,
129}
130
131pub struct SearchService {
132 client: Client,
133 config: Arc<Config>,
134}
135
136impl SearchService {
137 pub async fn new(config: Arc<Config>) -> Result<Self> {
138 let client = Client::builder()
139 .user_agent(&config.application.user_agent)
140 .build()
141 .context("Failed to create HTTP client")?;
142
143 let service = Self { client, config };
144
145 service.ensure_index_exists().await?;
147
148 Ok(service)
149 }
150
151 pub async fn search(
152 &self,
153 query: &str,
154 limit: usize,
155 offset: usize,
156 ) -> Result<Vec<SearchResult>> {
157 info!(
158 "🔍 SEARCH REQUEST: Query='{}', limit={}, offset={}",
159 query, limit, offset
160 );
161
162 let query_encoded = query.replace(" ", "%20").replace("+", "%2B");
165 let mut url = format!(
166 "{}?api-version={}&search={}&$top={}&$skip={}",
167 self.config.search_query_url(),
168 self.config.azure.search_api_version,
169 query_encoded,
170 limit,
171 offset
172 );
173
174 url.push_str("&$select=id,title,url,snippet,domain,indexed_at");
176 url.push_str("&highlight=content,title");
177 url.push_str("&highlightPreTag=%3Cmark%3E"); url.push_str("&highlightPostTag=%3C%2Fmark%3E"); debug!(
181 "🔍 SEARCH API: Sending request to Azure Cognitive Search at {}",
182 url
183 );
184
185 let start_time = std::time::Instant::now();
186
187 let response = self
188 .client
189 .get(&url)
190 .header("api-key", &self.config.azure.search_api_key)
191 .send()
192 .await
193 .context("Failed to send search request")?;
194
195 let elapsed = start_time.elapsed();
196 let status = response.status();
197
198 if !status.is_success() {
199 let error_text = response.text().await.unwrap_or_default();
200 error!(
201 "❌ SEARCH FAILED: Query='{}' failed with status {} in {}ms - {}",
202 query,
203 status,
204 elapsed.as_millis(),
205 error_text
206 );
207 return Err(anyhow::anyhow!(
208 "Search request failed with status {}: {}",
209 status,
210 error_text
211 ));
212 }
213
214 debug!(
215 "✅ SEARCH RESPONSE: Received response in {}ms with status {}",
216 elapsed.as_millis(),
217 status
218 );
219
220 let search_response: SearchResponse = response
221 .json()
222 .await
223 .context("Failed to parse search response")?;
224
225 let results: Vec<SearchResult> = search_response
226 .value
227 .into_iter()
228 .map(|hit| SearchResult {
229 id: hit.id,
230 title: hit.title,
231 url: hit.url,
232 snippet: hit.snippet,
233 score: hit.score,
234 indexed_at: hit.indexed_at,
235 })
236 .collect();
237
238 let total_time = start_time.elapsed();
239 info!(
240 "🎯 SEARCH COMPLETE: Query='{}' returned {} results in {}ms (processing time: {}ms)",
241 query,
242 results.len(),
243 total_time.as_millis(),
244 elapsed.as_millis()
245 );
246
247 if !results.is_empty() {
249 let top_score = results.iter().map(|r| r.score).fold(0.0, f64::max);
250 let avg_score = results.iter().map(|r| r.score).sum::<f64>() / results.len() as f64;
251 debug!(
252 "📊 SEARCH SCORES: Top score: {:.3}, Average score: {:.3}, Results: {}",
253 top_score,
254 avg_score,
255 results.len()
256 );
257 }
258
259 Ok(results)
260 }
261
262 pub async fn index_document(&self, document: &SearchDocument) -> Result<()> {
263 debug!(
264 "📝 INDEX DOCUMENT: Starting to index {} - title: '{}'",
265 document.url, document.title
266 );
267
268 let url = format!(
269 "{}/index?api-version={}",
270 self.config.search_documents_url(),
271 self.config.azure.search_api_version
272 );
273
274 let index_request = serde_json::json!({
275 "value": [
276 {
277 "@search.action": "upload",
278 "id": document.id,
279 "title": document.title,
280 "url": document.url,
281 "content": document.content,
282 "snippet": document.snippet,
283 "domain": document.domain,
284 "indexed_at": document.indexed_at,
285 "last_crawled": document.last_crawled,
286 }
287 ]
288 });
289
290 debug!(
291 "📤 INDEX API: Sending document to Azure Cognitive Search index at {}",
292 url
293 );
294 debug!(
295 "📄 DOCUMENT DETAILS: ID={}, Domain={}, Content size={} chars",
296 document.id,
297 document.domain,
298 document.content.len()
299 );
300
301 let start_time = std::time::Instant::now();
302
303 let response = self
304 .client
305 .post(&url)
306 .header("api-key", &self.config.azure.search_api_key)
307 .header("Content-Type", "application/json")
308 .json(&index_request)
309 .send()
310 .await
311 .context("Failed to send index request")?;
312
313 let elapsed = start_time.elapsed();
314 let status = response.status();
315
316 if !status.is_success() {
317 let error_text = response.text().await.unwrap_or_default();
318 error!(
319 "❌ INDEX FAILED: Document {} failed to index with status {} in {}ms - {}",
320 document.url,
321 status,
322 elapsed.as_millis(),
323 error_text
324 );
325 return Err(anyhow::anyhow!(
326 "Index request failed with status {}: {}",
327 status,
328 error_text
329 ));
330 }
331
332 crate::log_and_capture!(
333 info,
334 "✅ INDEXED: Document {} successfully indexed in {}ms (title: '{}', {} chars)",
335 document.url,
336 elapsed.as_millis(),
337 document.title,
338 document.content.len()
339 );
340 Ok(())
341 }
342
343 async fn ensure_index_exists(&self) -> Result<()> {
344 info!(
345 "🔍 INDEX CHECK: Verifying search index '{}' exists",
346 self.config.azure.search_index_name
347 );
348
349 let url = format!(
350 "{}?api-version={}",
351 self.config.search_index_url(),
352 self.config.azure.search_api_version
353 );
354
355 debug!("📡 INDEX API: Checking index existence at {}", url);
357 let response = self
358 .client
359 .get(&url)
360 .header("api-key", &self.config.azure.search_api_key)
361 .send()
362 .await
363 .context("Failed to check if index exists")?;
364
365 if response.status().is_success() {
366 info!(
367 "✅ INDEX EXISTS: Search index '{}' is already available",
368 self.config.azure.search_index_name
369 );
370 return Ok(());
371 }
372
373 info!(
375 "🛠️ CREATING INDEX: Search index '{}' not found, creating new index",
376 self.config.azure.search_index_name
377 );
378
379 let schema = IndexSchema {
380 name: self.config.azure.search_index_name.clone(),
381 fields: vec![
382 IndexField {
383 name: "id".to_string(),
384 field_type: "Edm.String".to_string(),
385 key: Some(true),
386 searchable: Some(false),
387 filterable: Some(true),
388 sortable: Some(false),
389 facetable: Some(false),
390 retrievable: Some(true),
391 },
392 IndexField {
393 name: "title".to_string(),
394 field_type: "Edm.String".to_string(),
395 key: Some(false),
396 searchable: Some(true),
397 filterable: Some(false),
398 sortable: Some(false),
399 facetable: Some(false),
400 retrievable: Some(true),
401 },
402 IndexField {
403 name: "url".to_string(),
404 field_type: "Edm.String".to_string(),
405 key: Some(false),
406 searchable: Some(false),
407 filterable: Some(true),
408 sortable: Some(false),
409 facetable: Some(false),
410 retrievable: Some(true),
411 },
412 IndexField {
413 name: "content".to_string(),
414 field_type: "Edm.String".to_string(),
415 key: Some(false),
416 searchable: Some(true),
417 filterable: Some(false),
418 sortable: Some(false),
419 facetable: Some(false),
420 retrievable: Some(false),
421 },
422 IndexField {
423 name: "snippet".to_string(),
424 field_type: "Edm.String".to_string(),
425 key: Some(false),
426 searchable: Some(false),
427 filterable: Some(false),
428 sortable: Some(false),
429 facetable: Some(false),
430 retrievable: Some(true),
431 },
432 IndexField {
433 name: "domain".to_string(),
434 field_type: "Edm.String".to_string(),
435 key: Some(false),
436 searchable: Some(false),
437 filterable: Some(true),
438 sortable: Some(false),
439 facetable: Some(true),
440 retrievable: Some(true),
441 },
442 IndexField {
443 name: "indexed_at".to_string(),
444 field_type: "Edm.DateTimeOffset".to_string(),
445 key: Some(false),
446 searchable: Some(false),
447 filterable: Some(true),
448 sortable: Some(true),
449 facetable: Some(false),
450 retrievable: Some(true),
451 },
452 IndexField {
453 name: "last_crawled".to_string(),
454 field_type: "Edm.DateTimeOffset".to_string(),
455 key: Some(false),
456 searchable: Some(false),
457 filterable: Some(true),
458 sortable: Some(true),
459 facetable: Some(false),
460 retrievable: Some(false),
461 },
462 ],
463 };
464
465 let create_url = format!(
466 "https://{}.search.windows.net/indexes?api-version={}",
467 self.config.azure.search_service_name, self.config.azure.search_api_version
468 );
469
470 debug!(
471 "🛠️ INDEX CREATE API: Sending create request to {}",
472 create_url
473 );
474 debug!(
475 "📋 INDEX SCHEMA: Creating index with {} fields",
476 schema.fields.len()
477 );
478
479 let response = self
480 .client
481 .post(&create_url)
482 .header("api-key", &self.config.azure.search_api_key)
483 .header("Content-Type", "application/json")
484 .json(&schema)
485 .send()
486 .await
487 .context("Failed to create search index")?;
488
489 if !response.status().is_success() {
490 let status = response.status();
491 let error_text = response.text().await.unwrap_or_default();
492 error!(
493 "❌ INDEX CREATE FAILED: Failed to create index '{}' with status {} - {}",
494 self.config.azure.search_index_name, status, error_text
495 );
496 return Err(anyhow::anyhow!(
497 "Failed to create search index with status {}: {}",
498 status,
499 error_text
500 ));
501 }
502
503 info!(
504 "✅ INDEX CREATED: Successfully created search index '{}' with {} fields",
505 self.config.azure.search_index_name,
506 schema.fields.len()
507 );
508 Ok(())
509 }
510}
511
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Arc;

    /// Builds a fully populated configuration with placeholder values.
    fn create_test_config() -> Arc<Config> {
        let azure = crate::config::AzureConfig {
            search_service_name: "test-service".to_string(),
            search_api_key: "test-key".to_string(),
            search_api_version: "2023-11-01".to_string(),
            search_index_name: "test-index".to_string(),
            cosmos_endpoint: "test".to_string(),
            cosmos_key: "test".to_string(),
            cosmos_database_name: "test".to_string(),
            cosmos_container_name: "test".to_string(),
        };

        let application = crate::config::ApplicationConfig {
            max_crawl_depth: 5,
            crawl_delay_ms: 1000,
            max_concurrent_requests: 10,
            user_agent: "test".to_string(),
            allowed_domains: vec![],
            periodic_index_interval_days: 7,
            duplicate_removal_interval_hours: 24,
            admin_api_key: "test".to_string(),
        };

        Arc::new(Config {
            environment: "test".to_string(),
            azure,
            application,
        })
    }

    /// Checks the base query URL and the space / plus escaping applied to
    /// query text.
    #[test]
    fn test_search_url_construction() {
        let cfg = create_test_config();

        assert_eq!(
            cfg.search_query_url(),
            "https://test-service.search.windows.net/indexes/test-index/docs"
        );

        // Same two replacements, applied in the same order, for both cases.
        let escape = |raw: &str| raw.replace(" ", "%20").replace("+", "%2B");

        assert_eq!(
            escape("web development tutorial"),
            "web%20development%20tutorial"
        );
        assert_eq!(escape("C++ programming"), "C%2B%2B%20programming");
    }

    /// Confirms that a `SearchRequest` can still be constructed and that
    /// its plain fields hold the values they were given.
    #[test]
    fn test_search_request_structure_removed() {
        let request = SearchRequest {
            search: "test".to_string(),
            top: 10,
            skip: 0,
            select: Some("id,title".to_string()),
            highlight: Some("content".to_string()),
            highlight_pre_tag: Some("<mark>".to_string()),
            highlight_post_tag: Some("</mark>".to_string()),
        };

        assert_eq!(request.search, "test");
        assert_eq!(request.top, 10);
        assert_eq!(request.skip, 0);
    }
}
583}