search_engine_backend/
search.rs

1//! # Search Service
2//!
3//! This module provides the search functionality using Azure Cognitive Search.
4//! It handles search queries, index management, and document operations.
5//!
6//! ## Key Components
7//!
8//! - [`SearchService`]: Main service for Azure Cognitive Search operations
9//! - [`SearchDocument`]: Document structure for search index
10//! - [`SearchRequest`]: Request structure for search queries
11//! - [`SearchResponse`]: Response structure from search operations
12//!
13//! ## Usage
14//!
15//! ```rust,no_run
16//! use search_engine_backend::{Config, SearchService};
17//! use std::sync::Arc;
18//!
19//! #[tokio::main]
20//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
21//!     let config = Arc::new(Config::from_env()?);
22//!     let search_service = SearchService::new(config).await?;
23//!     
24//!     // Perform a search
25//!     let results = search_service.search("rust programming", 10, 0).await?;
26//!     Ok(())
27//! }
28//! ```
29
30use anyhow::{Context, Result};
31use chrono::{DateTime, Utc};
32use reqwest::Client;
33use serde::{Deserialize, Serialize};
34use std::collections::HashMap;
35use std::sync::Arc;
36use tracing::{debug, error, info};
37
38use crate::{Config, SearchResult};
39
40/// Represents a document in the Azure Cognitive Search index.
41///
42/// This structure defines the schema for documents stored in the search index.
43#[derive(Debug, Clone, Serialize, Deserialize)]
44pub struct SearchDocument {
45    /// Unique identifier for the document
46    pub id: String,
47    /// Title of the document
48    pub title: String,
49    /// Original URL of the document
50    pub url: String,
51    /// Full text content for search indexing
52    pub content: String,
53    /// Brief excerpt for display in search results
54    pub snippet: String,
55    /// Domain name the document belongs to
56    pub domain: String,
57    /// Timestamp when the document was indexed
58    pub indexed_at: DateTime<Utc>,
59    /// Timestamp of the last crawl
60    pub last_crawled: DateTime<Utc>,
61}
62
63/// Request structure for Azure Cognitive Search queries.
64///
65/// Maps to the Azure Search REST API query parameters.
66#[derive(Debug, Serialize, Deserialize)]
67pub struct SearchRequest {
68    /// Search query string
69    pub search: String,
70    /// Maximum number of results to return
71    #[serde(rename = "$top")]
72    pub top: usize,
73    /// Number of results to skip for pagination
74    #[serde(rename = "$skip")]
75    pub skip: usize,
76    /// Fields to include in the response
77    #[serde(rename = "$select")]
78    pub select: Option<String>,
79    pub highlight: Option<String>,
80    #[serde(rename = "highlightPreTag")]
81    pub highlight_pre_tag: Option<String>,
82    #[serde(rename = "highlightPostTag")]
83    pub highlight_post_tag: Option<String>,
84}
85
86#[derive(Debug, Deserialize)]
87pub struct SearchResponse {
88    pub value: Vec<SearchHit>,
89    #[serde(rename = "@odata.count")]
90    #[allow(dead_code)]
91    pub count: Option<usize>,
92}
93
94#[derive(Debug, Deserialize)]
95pub struct SearchHit {
96    pub id: String,
97    pub title: String,
98    pub url: String,
99    #[allow(dead_code)]
100    pub content: Option<String>,
101    pub snippet: String,
102    #[allow(dead_code)]
103    pub domain: String,
104    pub indexed_at: DateTime<Utc>,
105    #[serde(rename = "@search.score")]
106    pub score: f64,
107    #[serde(rename = "@search.highlights")]
108    #[allow(dead_code)]
109    pub highlights: Option<HashMap<String, Vec<String>>>,
110}
111
112#[derive(Debug, Serialize)]
113pub struct IndexSchema {
114    pub name: String,
115    pub fields: Vec<IndexField>,
116}
117
118#[derive(Debug, Serialize)]
119pub struct IndexField {
120    pub name: String,
121    #[serde(rename = "type")]
122    pub field_type: String,
123    pub searchable: Option<bool>,
124    pub filterable: Option<bool>,
125    pub sortable: Option<bool>,
126    pub facetable: Option<bool>,
127    pub key: Option<bool>,
128    pub retrievable: Option<bool>,
129}
130
131pub struct SearchService {
132    client: Client,
133    config: Arc<Config>,
134}
135
136impl SearchService {
137    pub async fn new(config: Arc<Config>) -> Result<Self> {
138        let client = Client::builder()
139            .user_agent(&config.application.user_agent)
140            .build()
141            .context("Failed to create HTTP client")?;
142
143        let service = Self { client, config };
144
145        // Ensure the search index exists
146        service.ensure_index_exists().await?;
147
148        Ok(service)
149    }
150
151    pub async fn search(
152        &self,
153        query: &str,
154        limit: usize,
155        offset: usize,
156    ) -> Result<Vec<SearchResult>> {
157        info!(
158            "🔍 SEARCH REQUEST: Query='{}', limit={}, offset={}",
159            query, limit, offset
160        );
161
162        // Build URL with query parameters for Azure Cognitive Search API
163        // Simple URL encoding for common characters
164        let query_encoded = query.replace(" ", "%20").replace("+", "%2B");
165        let mut url = format!(
166            "{}?api-version={}&search={}&$top={}&$skip={}",
167            self.config.search_query_url(),
168            self.config.azure.search_api_version,
169            query_encoded,
170            limit,
171            offset
172        );
173
174        // Add optional parameters
175        url.push_str("&$select=id,title,url,snippet,domain,indexed_at");
176        url.push_str("&highlight=content,title");
177        url.push_str("&highlightPreTag=%3Cmark%3E"); // <mark> URL encoded
178        url.push_str("&highlightPostTag=%3C%2Fmark%3E"); // </mark> URL encoded
179
180        debug!(
181            "🔍 SEARCH API: Sending request to Azure Cognitive Search at {}",
182            url
183        );
184
185        let start_time = std::time::Instant::now();
186
187        let response = self
188            .client
189            .get(&url)
190            .header("api-key", &self.config.azure.search_api_key)
191            .send()
192            .await
193            .context("Failed to send search request")?;
194
195        let elapsed = start_time.elapsed();
196        let status = response.status();
197
198        if !status.is_success() {
199            let error_text = response.text().await.unwrap_or_default();
200            error!(
201                "❌ SEARCH FAILED: Query='{}' failed with status {} in {}ms - {}",
202                query,
203                status,
204                elapsed.as_millis(),
205                error_text
206            );
207            return Err(anyhow::anyhow!(
208                "Search request failed with status {}: {}",
209                status,
210                error_text
211            ));
212        }
213
214        debug!(
215            "✅ SEARCH RESPONSE: Received response in {}ms with status {}",
216            elapsed.as_millis(),
217            status
218        );
219
220        let search_response: SearchResponse = response
221            .json()
222            .await
223            .context("Failed to parse search response")?;
224
225        let results: Vec<SearchResult> = search_response
226            .value
227            .into_iter()
228            .map(|hit| SearchResult {
229                id: hit.id,
230                title: hit.title,
231                url: hit.url,
232                snippet: hit.snippet,
233                score: hit.score,
234                indexed_at: hit.indexed_at,
235            })
236            .collect();
237
238        let total_time = start_time.elapsed();
239        info!(
240            "🎯 SEARCH COMPLETE: Query='{}' returned {} results in {}ms (processing time: {}ms)",
241            query,
242            results.len(),
243            total_time.as_millis(),
244            elapsed.as_millis()
245        );
246
247        // Log some result details if we have results
248        if !results.is_empty() {
249            let top_score = results.iter().map(|r| r.score).fold(0.0, f64::max);
250            let avg_score = results.iter().map(|r| r.score).sum::<f64>() / results.len() as f64;
251            debug!(
252                "📊 SEARCH SCORES: Top score: {:.3}, Average score: {:.3}, Results: {}",
253                top_score,
254                avg_score,
255                results.len()
256            );
257        }
258
259        Ok(results)
260    }
261
262    pub async fn index_document(&self, document: &SearchDocument) -> Result<()> {
263        debug!(
264            "📝 INDEX DOCUMENT: Starting to index {} - title: '{}'",
265            document.url, document.title
266        );
267
268        let url = format!(
269            "{}/index?api-version={}",
270            self.config.search_documents_url(),
271            self.config.azure.search_api_version
272        );
273
274        let index_request = serde_json::json!({
275            "value": [
276                {
277                    "@search.action": "upload",
278                    "id": document.id,
279                    "title": document.title,
280                    "url": document.url,
281                    "content": document.content,
282                    "snippet": document.snippet,
283                    "domain": document.domain,
284                    "indexed_at": document.indexed_at,
285                    "last_crawled": document.last_crawled,
286                }
287            ]
288        });
289
290        debug!(
291            "📤 INDEX API: Sending document to Azure Cognitive Search index at {}",
292            url
293        );
294        debug!(
295            "📄 DOCUMENT DETAILS: ID={}, Domain={}, Content size={} chars",
296            document.id,
297            document.domain,
298            document.content.len()
299        );
300
301        let start_time = std::time::Instant::now();
302
303        let response = self
304            .client
305            .post(&url)
306            .header("api-key", &self.config.azure.search_api_key)
307            .header("Content-Type", "application/json")
308            .json(&index_request)
309            .send()
310            .await
311            .context("Failed to send index request")?;
312
313        let elapsed = start_time.elapsed();
314        let status = response.status();
315
316        if !status.is_success() {
317            let error_text = response.text().await.unwrap_or_default();
318            error!(
319                "❌ INDEX FAILED: Document {} failed to index with status {} in {}ms - {}",
320                document.url,
321                status,
322                elapsed.as_millis(),
323                error_text
324            );
325            return Err(anyhow::anyhow!(
326                "Index request failed with status {}: {}",
327                status,
328                error_text
329            ));
330        }
331
332        crate::log_and_capture!(
333            info,
334            "✅ INDEXED: Document {} successfully indexed in {}ms (title: '{}', {} chars)",
335            document.url,
336            elapsed.as_millis(),
337            document.title,
338            document.content.len()
339        );
340        Ok(())
341    }
342
343    async fn ensure_index_exists(&self) -> Result<()> {
344        info!(
345            "🔍 INDEX CHECK: Verifying search index '{}' exists",
346            self.config.azure.search_index_name
347        );
348
349        let url = format!(
350            "{}?api-version={}",
351            self.config.search_index_url(),
352            self.config.azure.search_api_version
353        );
354
355        // First, check if index exists
356        debug!("📡 INDEX API: Checking index existence at {}", url);
357        let response = self
358            .client
359            .get(&url)
360            .header("api-key", &self.config.azure.search_api_key)
361            .send()
362            .await
363            .context("Failed to check if index exists")?;
364
365        if response.status().is_success() {
366            info!(
367                "✅ INDEX EXISTS: Search index '{}' is already available",
368                self.config.azure.search_index_name
369            );
370            return Ok(());
371        }
372
373        // Create the index if it doesn't exist
374        info!(
375            "🛠️ CREATING INDEX: Search index '{}' not found, creating new index",
376            self.config.azure.search_index_name
377        );
378
379        let schema = IndexSchema {
380            name: self.config.azure.search_index_name.clone(),
381            fields: vec![
382                IndexField {
383                    name: "id".to_string(),
384                    field_type: "Edm.String".to_string(),
385                    key: Some(true),
386                    searchable: Some(false),
387                    filterable: Some(true),
388                    sortable: Some(false),
389                    facetable: Some(false),
390                    retrievable: Some(true),
391                },
392                IndexField {
393                    name: "title".to_string(),
394                    field_type: "Edm.String".to_string(),
395                    key: Some(false),
396                    searchable: Some(true),
397                    filterable: Some(false),
398                    sortable: Some(false),
399                    facetable: Some(false),
400                    retrievable: Some(true),
401                },
402                IndexField {
403                    name: "url".to_string(),
404                    field_type: "Edm.String".to_string(),
405                    key: Some(false),
406                    searchable: Some(false),
407                    filterable: Some(true),
408                    sortable: Some(false),
409                    facetable: Some(false),
410                    retrievable: Some(true),
411                },
412                IndexField {
413                    name: "content".to_string(),
414                    field_type: "Edm.String".to_string(),
415                    key: Some(false),
416                    searchable: Some(true),
417                    filterable: Some(false),
418                    sortable: Some(false),
419                    facetable: Some(false),
420                    retrievable: Some(false),
421                },
422                IndexField {
423                    name: "snippet".to_string(),
424                    field_type: "Edm.String".to_string(),
425                    key: Some(false),
426                    searchable: Some(false),
427                    filterable: Some(false),
428                    sortable: Some(false),
429                    facetable: Some(false),
430                    retrievable: Some(true),
431                },
432                IndexField {
433                    name: "domain".to_string(),
434                    field_type: "Edm.String".to_string(),
435                    key: Some(false),
436                    searchable: Some(false),
437                    filterable: Some(true),
438                    sortable: Some(false),
439                    facetable: Some(true),
440                    retrievable: Some(true),
441                },
442                IndexField {
443                    name: "indexed_at".to_string(),
444                    field_type: "Edm.DateTimeOffset".to_string(),
445                    key: Some(false),
446                    searchable: Some(false),
447                    filterable: Some(true),
448                    sortable: Some(true),
449                    facetable: Some(false),
450                    retrievable: Some(true),
451                },
452                IndexField {
453                    name: "last_crawled".to_string(),
454                    field_type: "Edm.DateTimeOffset".to_string(),
455                    key: Some(false),
456                    searchable: Some(false),
457                    filterable: Some(true),
458                    sortable: Some(true),
459                    facetable: Some(false),
460                    retrievable: Some(false),
461                },
462            ],
463        };
464
465        let create_url = format!(
466            "https://{}.search.windows.net/indexes?api-version={}",
467            self.config.azure.search_service_name, self.config.azure.search_api_version
468        );
469
470        debug!(
471            "🛠️ INDEX CREATE API: Sending create request to {}",
472            create_url
473        );
474        debug!(
475            "📋 INDEX SCHEMA: Creating index with {} fields",
476            schema.fields.len()
477        );
478
479        let response = self
480            .client
481            .post(&create_url)
482            .header("api-key", &self.config.azure.search_api_key)
483            .header("Content-Type", "application/json")
484            .json(&schema)
485            .send()
486            .await
487            .context("Failed to create search index")?;
488
489        if !response.status().is_success() {
490            let status = response.status();
491            let error_text = response.text().await.unwrap_or_default();
492            error!(
493                "❌ INDEX CREATE FAILED: Failed to create index '{}' with status {} - {}",
494                self.config.azure.search_index_name, status, error_text
495            );
496            return Err(anyhow::anyhow!(
497                "Failed to create search index with status {}: {}",
498                status,
499                error_text
500            ));
501        }
502
503        info!(
504            "✅ INDEX CREATED: Successfully created search index '{}' with {} fields",
505            self.config.azure.search_index_name,
506            schema.fields.len()
507        );
508        Ok(())
509    }
510}
511
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Arc;

    /// Returns a configuration filled with placeholder values; no network
    /// access is needed to use it.
    fn create_test_config() -> Arc<Config> {
        let azure = crate::config::AzureConfig {
            search_service_name: "test-service".to_string(),
            search_api_key: "test-key".to_string(),
            search_api_version: "2023-11-01".to_string(),
            search_index_name: "test-index".to_string(),
            cosmos_endpoint: "test".to_string(),
            cosmos_key: "test".to_string(),
            cosmos_database_name: "test".to_string(),
            cosmos_container_name: "test".to_string(),
        };

        let application = crate::config::ApplicationConfig {
            max_crawl_depth: 5,
            crawl_delay_ms: 1000,
            max_concurrent_requests: 10,
            user_agent: "test".to_string(),
            allowed_domains: vec![],
            periodic_index_interval_days: 7,
            duplicate_removal_interval_hours: 24,
            admin_api_key: "test".to_string(),
        };

        Arc::new(Config {
            environment: "test".to_string(),
            azure,
            application,
        })
    }

    /// The query base URL is derived from the service and index names, and
    /// the simple space/plus encoding yields the expected escapes.
    #[test]
    fn test_search_url_construction() {
        let config = create_test_config();

        assert_eq!(
            config.search_query_url(),
            "https://test-service.search.windows.net/indexes/test-index/docs"
        );

        // (raw query, expected encoded form)
        let cases = [
            ("web development tutorial", "web%20development%20tutorial"),
            ("C++ programming", "C%2B%2B%20programming"),
        ];
        for &(raw, expected) in cases.iter() {
            let encoded = raw.replace(' ', "%20").replace('+', "%2B");
            assert_eq!(encoded, expected);
        }
    }

    /// `SearchRequest` is no longer sent as a JSON body (search uses GET with
    /// query parameters), but the struct must remain constructible.
    #[test]
    fn test_search_request_structure_removed() {
        let request = SearchRequest {
            search: String::from("test"),
            top: 10,
            skip: 0,
            select: Some(String::from("id,title")),
            highlight: Some(String::from("content")),
            highlight_pre_tag: Some(String::from("<mark>")),
            highlight_post_tag: Some(String::from("</mark>")),
        };

        assert_eq!(request.search, "test");
        assert_eq!(request.top, 10);
        assert_eq!(request.skip, 0);
    }
}
583}