search_engine_backend/
lib.rs

1//! # Search Engine Backend
2//!
3//! A Rust-based search engine backend that integrates with Azure Cognitive Search and CosmosDB
4//! to provide web crawling, indexing, and search capabilities.
5//!
6//! ## Features
7//!
8//! - **Web Search**: Full-text search across indexed web content
9//! - **Domain Indexing**: Crawl and index web domains with robots.txt compliance
10//! - **Search Analytics**: Track search queries, performance metrics, and usage patterns
11//! - **Azure Integration**: Uses Azure Cognitive Search and CosmosDB for scalable storage
12//! - **REST API**: Clean HTTP API for all operations
13//!
14//! ## Architecture
15//!
16//! The application is structured around several key services:
17//!
18//! - [`SearchService`]: Handles search operations via Azure Cognitive Search
19//! - [`StorageService`]: Manages data persistence in Azure CosmosDB
20//! - [`IndexerService`]: Orchestrates web crawling and content indexing
21//! - [`Config`]: Application configuration management
22//!
23//! ## Example Usage
24//!
25//! ```rust,no_run
26//! use search_engine_backend::{Config, AppState, create_router, StorageService, SearchService, IndexerService};
27//! use std::sync::Arc;
28//!
29//! #[tokio::main]
30//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
31//!     // Load configuration
32//!     let config = Arc::new(Config::from_env()?);
33//!     
34//!     // Initialize services
35//!     let storage_service = Arc::new(StorageService::new(config.clone()).await?);
36//!     let search_service = Arc::new(SearchService::new(config.clone()).await?);
37//!     let indexer_service = Arc::new(IndexerService::new(
38//!         config.clone(),
39//!         storage_service.clone(),
40//!         search_service.clone()
41//!     ).await?);
42//!     
43//!     // Create application state
44//!     let app_state = AppState {
45//!         config,
46//!         search_service,
47//!         storage_service,
48//!         indexer_service,
49//!     };
50//!     
51//!     // Create router and start server
52//!     let app = create_router(app_state);
53//!     let listener = tokio::net::TcpListener::bind("0.0.0.0:3000").await?;
54//!     axum::serve(listener, app).await?;
55//!     
56//!     Ok(())
57//! }
58//! ```
59
60use anyhow::Result;
61use axum::{
62    extract::{Query, State},
63    http::{HeaderMap, StatusCode},
64    response::{Html, Json},
65    routing::{get, post},
66    Router,
67};
68use chrono::Utc;
69use clap::Parser;
70use serde::{Deserialize, Serialize};
71use std::sync::Arc;
72use tracing::{info, warn};
73
74mod config;
75mod indexer;
76mod search;
77mod storage;
78
79pub use config::Config;
80pub use indexer::IndexerService;
81pub use search::SearchService;
82pub use storage::{SearchStatistic, StorageService};
83
84/// Command-line arguments for the Search Engine Backend application.
85///
86/// Supports configuration of server port and optional config file path.
87#[derive(Parser)]
88#[command(name = "search-backend")]
89#[command(about = "A Rust search engine backend using Azure services")]
90pub struct Args {
91    /// Port number to run the HTTP server on
92    #[arg(short, long, default_value = "3000")]
93    pub port: u16,
94
95    /// Optional path to configuration file
96    #[arg(short, long)]
97    config: Option<String>,
98}
99
100/// Application state containing all service instances and configuration.
101///
102/// This struct holds Arc-wrapped instances of all services to enable
103/// safe sharing across async tasks and HTTP handlers.
104#[derive(Clone)]
105pub struct AppState {
106    /// Application configuration
107    pub config: Arc<Config>,
108    /// Search service for Azure Cognitive Search operations
109    pub search_service: Arc<SearchService>,
110    /// Storage service for CosmosDB operations
111    pub storage_service: Arc<StorageService>,
112    /// Indexer service for web crawling and content indexing
113    pub indexer_service: Arc<IndexerService>,
114}
115
116/// Query parameters for search requests.
117///
118/// Used to parse URL query parameters for the search endpoint.
119#[derive(Deserialize)]
120pub struct SearchQuery {
121    /// The search query string
122    q: String,
123    /// Maximum number of results to return (default: 10)
124    #[serde(default = "default_limit")]
125    limit: usize,
126    /// Number of results to skip for pagination (default: 0)
127    #[serde(default)]
128    offset: usize,
129}
130
/// Serde fallback for [`SearchQuery::limit`] when the request omits `limit`.
fn default_limit() -> usize {
    /// Page size used when the caller does not ask for a specific one.
    const DEFAULT_SEARCH_LIMIT: usize = 10;

    DEFAULT_SEARCH_LIMIT
}
135
136/// Response structure for search operations.
137///
138/// Contains search results along with metadata about the search operation.
139#[derive(Serialize)]
140pub struct SearchResponse {
141    /// The original search query
142    query: String,
143    /// Array of search results
144    results: Vec<SearchResult>,
145    /// Total number of results found
146    total_count: usize,
147    /// Time taken to process the search in milliseconds
148    took_ms: u64,
149}
150
151/// Individual search result item.
152///
153/// Represents a single document found in the search index.
154#[derive(Serialize, Clone)]
155pub struct SearchResult {
156    /// Unique identifier for the document
157    pub id: String,
158    /// Title of the document/page
159    pub title: String,
160    /// Original URL of the document
161    pub url: String,
162    /// Relevant excerpt from the document content
163    pub snippet: String,
164    /// Search relevance score (0.0 to 1.0)
165    pub score: f64,
166    /// Timestamp when the document was indexed
167    pub indexed_at: chrono::DateTime<chrono::Utc>,
168}
169
170/// Request structure for indexing operations.
171///
172/// Contains a list of domains to be crawled and indexed.
173#[derive(Deserialize)]
174pub struct IndexRequest {
175    /// Array of domain names to crawl and index
176    domains: Vec<String>,
177}
178
179/// Response structure for indexing operations.
180///
181/// Confirms successful queuing of domains for indexing.
182#[derive(Serialize)]
183pub struct IndexResponse {
184    /// Human-readable success message
185    message: String,
186    /// Number of domains successfully queued
187    domains_queued: usize,
188}
189
190/// Response structure for force indexing operations.
191///
192/// Confirms successful force indexing initiation.
193#[derive(Serialize)]
194pub struct ForceIndexResponse {
195    /// Human-readable success message
196    message: String,
197    /// Number of domains queued for immediate indexing
198    domains_queued: usize,
199    /// Whether periodic indexing timer was reset
200    timer_reset: bool,
201}
202
203/// Response structure for force queue processing operations.
204///
205/// Confirms successful force queue processing initiation.
206#[derive(Serialize)]
207pub struct ForceProcessQueueResponse {
208    /// Human-readable success message
209    message: String,
210    /// Whether queue processing was triggered
211    triggered: bool,
212}
213
214/// Response structure for search statistics.
215///
216/// Contains recent search operations and analytics data.
217#[derive(Serialize)]
218pub struct StatsResponse {
219    /// Array of recent search statistics
220    recent_searches: Vec<SearchStatistic>,
221    /// Total number of statistics returned
222    total_count: usize,
223}
224
225/// Response structure for top search queries.
226///
227/// Contains the most frequently searched terms.
228#[derive(Serialize)]
229pub struct TopQueriesResponse {
230    /// Array of top search queries with frequencies
231    top_queries: Vec<TopQuery>,
232}
233
234/// Individual top query item.
235///
236/// Represents a frequently searched query with its frequency count.
237#[derive(Serialize)]
238pub struct TopQuery {
239    /// The search query (normalized)
240    query: String,
241    /// Number of times this query was searched
242    frequency: usize,
243}
244
245/// Validates admin authentication from request headers.
246///
247/// Checks the X-Admin-Key header against the configured admin API key.
248///
249/// # Arguments
250/// * `headers` - HTTP request headers
251/// * `config` - Application configuration containing the admin API key
252///
253/// # Returns
254/// `true` if authentication is valid, `false` otherwise
255fn validate_admin_auth(headers: &HeaderMap, config: &Config) -> bool {
256    if let Some(auth_header) = headers.get("X-Admin-Key") {
257        if let Ok(provided_key) = auth_header.to_str() {
258            return provided_key == config.application.admin_api_key;
259        }
260    }
261    false
262}
263
264/// HTTP handler for search operations.
265///
266/// Processes search queries and returns matching documents from the search index.
267/// Search statistics are stored asynchronously to avoid blocking the response.
268///
269/// # Query Parameters
270/// - `q`: Search query string (required)
271/// - `limit`: Maximum number of results (optional, default: 10)
272/// - `offset`: Number of results to skip for pagination (optional, default: 0)
273///
274/// # Returns
275/// - `200 OK`: JSON response with search results and metadata
276/// - `500 Internal Server Error`: If search operation fails
277///
278/// # Example
279/// ```text
280/// GET /search?q=rust+programming&limit=5&offset=0
281/// ```
282pub async fn search_handler(
283    Query(params): Query<SearchQuery>,
284    State(state): State<AppState>,
285) -> Result<Json<SearchResponse>, StatusCode> {
286    let start = std::time::Instant::now();
287
288    crate::log_and_capture!(
289        info,
290        "🔍 SEARCH: Searching for '{}' (limit: {}, offset: {})",
291        params.q,
292        params.limit,
293        params.offset
294    );
295
296    match state
297        .search_service
298        .search(&params.q, params.limit, params.offset)
299        .await
300    {
301        Ok(results) => {
302            let took_ms = start.elapsed().as_millis() as u64;
303            let response = SearchResponse {
304                query: params.q.clone(),
305                total_count: results.len(),
306                results,
307                took_ms,
308            };
309
310            // Log search completion
311            crate::log_and_capture!(
312                info,
313                "✅ SEARCH: Found {} results for '{}' in {}ms",
314                response.total_count,
315                params.q,
316                took_ms
317            );
318
319            // Store search statistics asynchronously (don't block response)
320            let statistic = SearchStatistic {
321                id: uuid::Uuid::new_v4().to_string(),
322                query: params.q.clone(),
323                query_normalized: params.q.trim().to_lowercase(),
324                result_count: response.total_count,
325                search_time_ms: took_ms,
326                timestamp: chrono::Utc::now(),
327                user_ip: None, // TODO: Extract from request headers if needed
328            };
329
330            let storage_service = state.storage_service.clone();
331            tokio::spawn(async move {
332                if let Err(e) = storage_service.store_search_statistic(&statistic).await {
333                    warn!("Failed to store search statistic: {}", e);
334                }
335            });
336
337            Ok(Json(response))
338        }
339        Err(e) => {
340            warn!("Search failed: {}", e);
341            Err(StatusCode::INTERNAL_SERVER_ERROR)
342        }
343    }
344}
345
346/// HTTP handler for domain indexing operations.
347///
348/// Queues one or more domains for crawling and indexing. The indexing
349/// process runs asynchronously in the background.
350///
351/// # Request Body
352/// JSON object containing an array of domain names:
353/// ```json
354/// {
355///   "domains": ["example.com", "another-site.org"]
356/// }
357/// ```
358///
359/// # Returns
360/// - `200 OK`: JSON response confirming domains were queued
361/// - `500 Internal Server Error`: If queueing operation fails
362///
363/// # Example
364/// ```text
365/// POST /index
366/// Content-Type: application/json
367///
368/// {"domains": ["example.com"]}
369/// ```
370pub async fn index_handler(
371    State(state): State<AppState>,
372    Json(payload): Json<IndexRequest>,
373) -> Result<Json<IndexResponse>, StatusCode> {
374    info!("Indexing request for {} domains", payload.domains.len());
375
376    match state.indexer_service.queue_domains(&payload.domains).await {
377        Ok(count) => {
378            let response = IndexResponse {
379                message: format!("Successfully queued {} domains for indexing", count),
380                domains_queued: count,
381            };
382            Ok(Json(response))
383        }
384        Err(e) => {
385            warn!("Indexing failed: {}", e);
386            Err(StatusCode::INTERNAL_SERVER_ERROR)
387        }
388    }
389}
390
391/// HTTP handler for force indexing operations.
392///
393/// Forces immediate indexing of all allowed domains, bypassing the normal
394/// time-based checks. This endpoint requires admin authentication via the
395/// X-Admin-Key header. Individual domain failures are logged but do not
396/// cause the entire operation to fail.
397///
398/// # Headers
399/// - `X-Admin-Key`: Required admin API key for authentication
400///
401/// # Returns
402/// - `200 OK`: JSON response confirming force indexing was initiated
403/// - `401 Unauthorized`: If admin authentication fails
404///
405/// # Example
406/// ```text
407/// POST /admin/force-index
408/// X-Admin-Key: your-admin-key
409/// ```
410pub async fn admin_force_index_handler(
411    headers: HeaderMap,
412    State(state): State<AppState>,
413) -> Result<Json<ForceIndexResponse>, StatusCode> {
414    // Validate admin authentication
415    if !validate_admin_auth(&headers, &state.config) {
416        warn!("Unauthorized force index attempt - invalid or missing admin key");
417        return Err(StatusCode::UNAUTHORIZED);
418    }
419
420    info!("🔐 Admin force index request authenticated successfully");
421    info!("🚀 Initiating force indexing of all allowed domains");
422
423    // Force index all allowed domains without time checks
424    let allowed_domains = &state.config.application.allowed_domains;
425
426    // Always return success even if some/all domains fail to queue
427    let count = match state
428        .indexer_service
429        .queue_domains_with_check(allowed_domains, false)
430        .await
431    {
432        Ok(count) => {
433            info!(
434                "✅ Force indexing initiated: {} domains queued for immediate processing",
435                count
436            );
437            count
438        }
439        Err(e) => {
440            // Log the systemic error but don't fail the request
441            warn!(
442                "âš ī¸ Force indexing encountered issues: {} - but individual domain results may vary",
443                e
444            );
445            info!("â„šī¸ Some domains may have been queued successfully despite the error");
446            0 // Return 0 as a safe fallback, actual successful domains may be higher
447        }
448    };
449
450    let response = ForceIndexResponse {
451        message: format!(
452            "Force indexing initiated: {} domains queued for immediate processing",
453            count
454        ),
455        domains_queued: count,
456        timer_reset: true, // We're effectively resetting by forcing immediate processing
457    };
458    Ok(Json(response))
459}
460
461/// HTTP handler for force queue processing operations.
462///
463/// Forces immediate processing of the crawl queue, bypassing the normal
464/// sleep period when no items are pending. This endpoint requires admin
465/// authentication via the X-Admin-Key header.
466///
467/// # Headers
468/// - `X-Admin-Key`: Required admin API key for authentication
469///
470/// # Returns
471/// - `200 OK`: JSON response confirming force queue processing was triggered
472/// - `401 Unauthorized`: If admin authentication fails
473/// - `500 Internal Server Error`: If triggering fails
474///
475/// # Example
476/// ```text
477/// POST /admin/force-process-queue
478/// X-Admin-Key: your-admin-key
479/// ```
480pub async fn admin_force_process_queue_handler(
481    headers: HeaderMap,
482    State(state): State<AppState>,
483) -> Result<Json<ForceProcessQueueResponse>, StatusCode> {
484    // Validate admin authentication
485    if !validate_admin_auth(&headers, &state.config) {
486        warn!("Unauthorized force process queue attempt - invalid or missing admin key");
487        return Err(StatusCode::UNAUTHORIZED);
488    }
489
490    info!("🔐 Admin force process queue request authenticated successfully");
491    info!("🚀 Triggering immediate queue processing");
492
493    // Trigger immediate queue processing
494    match state.indexer_service.trigger_force_process_queue() {
495        Ok(()) => {
496            info!("✅ Force queue processing triggered successfully");
497            let response = ForceProcessQueueResponse {
498                message: "Force queue processing triggered successfully".to_string(),
499                triggered: true,
500            };
501            Ok(Json(response))
502        }
503        Err(e) => {
504            warn!("Failed to trigger force queue processing: {}", e);
505            Err(StatusCode::INTERNAL_SERVER_ERROR)
506        }
507    }
508}
509
510/// HTTP handler for retrieving search statistics.
511///
512/// Returns recent search operations and analytics data for monitoring
513/// and analysis purposes. This endpoint requires admin authentication via the
514/// X-Admin-Key header.
515///
516/// # Headers
517/// - `X-Admin-Key`: Required admin API key for authentication
518///
519/// # Returns
520/// - `200 OK`: JSON response with recent search statistics
521/// - `401 Unauthorized`: If admin authentication fails
522/// - `500 Internal Server Error`: If statistics retrieval fails
523///
524/// # Response
525/// Returns up to 50 recent search operations with their metadata including
526/// query, result count, search time, and timestamp.
527///
528/// # Example
529/// ```text
530/// GET /admin/stats
531/// X-Admin-Key: your-admin-key
532/// ```
533pub async fn admin_stats_handler(
534    headers: HeaderMap,
535    State(state): State<AppState>,
536) -> Result<Json<StatsResponse>, StatusCode> {
537    // Validate admin authentication
538    if !validate_admin_auth(&headers, &state.config) {
539        warn!("Unauthorized admin stats attempt - invalid or missing admin key");
540        return Err(StatusCode::UNAUTHORIZED);
541    }
542
543    info!("🔐 Admin stats request authenticated successfully");
544
545    match state.storage_service.get_recent_search_statistics(50).await {
546        Ok(recent_searches) => {
547            info!(
548                "📊 Retrieved {} recent search statistics",
549                recent_searches.len()
550            );
551            let response = StatsResponse {
552                total_count: recent_searches.len(),
553                recent_searches,
554            };
555            Ok(Json(response))
556        }
557        Err(e) => {
558            warn!("Failed to get search statistics: {}", e);
559            Err(StatusCode::INTERNAL_SERVER_ERROR)
560        }
561    }
562}
563
564/// HTTP handler for retrieving top search queries.
565///
566/// Returns the most frequently searched terms for analytics and content
567/// optimization purposes. This endpoint requires admin authentication via the
568/// X-Admin-Key header.
569///
570/// # Headers
571/// - `X-Admin-Key`: Required admin API key for authentication
572///
573/// # Returns
574/// - `200 OK`: JSON response with top search queries and their frequencies
575/// - `401 Unauthorized`: If admin authentication fails
576/// - `500 Internal Server Error`: If query retrieval fails
577///
578/// # Response
579/// Returns up to 20 most frequently searched queries with their occurrence counts.
580/// Queries are normalized for proper aggregation (lowercased, trimmed).
581///
582/// # Example
583/// ```text
584/// GET /admin/top-queries
585/// X-Admin-Key: your-admin-key
586/// ```
587pub async fn admin_top_queries_handler(
588    headers: HeaderMap,
589    State(state): State<AppState>,
590) -> Result<Json<TopQueriesResponse>, StatusCode> {
591    // Validate admin authentication
592    if !validate_admin_auth(&headers, &state.config) {
593        warn!("Unauthorized admin top queries attempt - invalid or missing admin key");
594        return Err(StatusCode::UNAUTHORIZED);
595    }
596
597    info!("🔐 Admin top queries request authenticated successfully");
598
599    match state.storage_service.get_top_search_queries(20).await {
600        Ok(top_queries_data) => {
601            let top_queries: Vec<TopQuery> = top_queries_data
602                .into_iter()
603                .map(|(query, frequency)| TopQuery { query, frequency })
604                .collect();
605
606            info!("📈 Retrieved {} top search queries", top_queries.len());
607            let response = TopQueriesResponse { top_queries };
608            Ok(Json(response))
609        }
610        Err(e) => {
611            warn!("Failed to get top queries: {}", e);
612            Err(StatusCode::INTERNAL_SERVER_ERROR)
613        }
614    }
615}
616
617/// HTTP handler for health check operations.
618///
619/// Provides a simple health check endpoint for monitoring and load balancer
620/// health checks. Always returns a successful response with service status.
621///
622/// # Returns
623/// Always returns `200 OK` with JSON containing:
624/// - Service status ("healthy")
625/// - Current timestamp
626/// - Service name
627///
628/// # Example
629/// ```text
630/// GET /health
631/// ```
632pub async fn health_handler() -> Json<serde_json::Value> {
633    Json(serde_json::json!({
634        "status": "healthy",
635        "timestamp": chrono::Utc::now(),
636        "service": "search-engine-backend"
637    }))
638}
639
640/// Generates a system activity log for the statistics dashboard.
641///
642/// Creates a log of recent system activities based on crawl queue status, search activity, and captured logs.
643fn generate_system_activity_log(
644    current_time: &chrono::DateTime<chrono::Utc>,
645    pending: usize,
646    processing: usize,
647    completed: usize,
648    failed: usize,
649    recent_searches: &[SearchStatistic],
650    recent_logs: &[crate::storage::LogEntry],
651) -> String {
652    let mut log_entries = Vec::new();
653
654    // Add recent captured application logs first (most important)
655    for log in recent_logs.iter().take(8) {
656        log_entries.push(format!(
657            "<div class=\"activity-item\">[{}] {}: {}</div>",
658            log.timestamp.format("%H:%M:%S"),
659            log.level.to_uppercase(),
660            log.message
661        ));
662    }
663
664    // Add current status summary
665    log_entries.push(format!(
666        "<div class=\"activity-item\">[{}] System Status: {} pending, {} processing, {} completed, {} failed</div>",
667        current_time.format("%H:%M:%S"),
668        pending,
669        processing,
670        completed,
671        failed
672    ));
673
674    // Add queue activity status
675    if processing > 0 {
676        log_entries.push(format!(
677            "<div class=\"activity-item\">[{}] ⚡ Queue processor is actively processing {} items</div>",
678            current_time.format("%H:%M:%S"),
679            processing
680        ));
681    } else if pending > 0 {
682        log_entries.push(format!(
683            "<div class=\"activity-item\">[{}] âŗ {} items waiting in queue for processing</div>",
684            current_time.format("%H:%M:%S"),
685            pending
686        ));
687    } else {
688        log_entries.push(format!(
689            "<div class=\"activity-item\">[{}] ✅ Queue is idle - no pending items</div>",
690            current_time.format("%H:%M:%S")
691        ));
692    }
693
694    // Add search activity summary
695    if !recent_searches.is_empty() {
696        let avg_response_time = recent_searches
697            .iter()
698            .map(|s| s.search_time_ms)
699            .sum::<u64>()
700            / recent_searches.len() as u64;
701        let total_results = recent_searches
702            .iter()
703            .map(|s| s.result_count)
704            .sum::<usize>();
705        log_entries.push(format!(
706            "<div class=\"activity-item\">[{}] 🔍 {} recent searches, {} total results, {}ms avg response</div>",
707            current_time.format("%H:%M:%S"),
708            recent_searches.len(),
709            total_results,
710            avg_response_time
711        ));
712
713        // Add most recent search
714        if let Some(latest_search) = recent_searches.first() {
715            log_entries.push(format!(
716                "<div class=\"activity-item\">[{}] Last search: \"{}\" ({} results, {}ms)</div>",
717                latest_search.timestamp.format("%H:%M:%S"),
718                latest_search.query,
719                latest_search.result_count,
720                latest_search.search_time_ms
721            ));
722        }
723    } else {
724        log_entries.push(format!(
725            "<div class=\"activity-item\">[{}] 🔍 No recent search activity</div>",
726            current_time.format("%H:%M:%S")
727        ));
728    }
729
730    // Add system health indicators
731    if failed > 20 {
732        log_entries.push(format!(
733            "<div class=\"activity-item\">[{}] âš ī¸ High failure rate detected: {} failed items</div>",
734            current_time.format("%H:%M:%S"),
735            failed
736        ));
737    }
738
739    if completed > 0 {
740        let success_rate = (completed * 100) / (completed + failed);
741        log_entries.push(format!(
742            "<div class=\"activity-item\">[{}] 📊 Success rate: {}% ({}/{} successful)</div>",
743            current_time.format("%H:%M:%S"),
744            success_rate,
745            completed,
746            completed + failed
747        ));
748    }
749
750    // Keep only the most recent 10 entries to avoid overwhelming the display
751    log_entries
752        .into_iter()
753        .take(10)
754        .collect::<Vec<_>>()
755        .join("\n")
756}
757
758/// HTTP handler for the statistics dashboard page.
759///
760/// Displays a live status page showing current crawling activity, queue statistics,
761/// and search analytics. The page is designed with an old-school terminal aesthetic.
762/// This endpoint does not require authentication and serves as a public dashboard.
763///
764/// # Returns
765/// HTML page with live statistics dashboard
766///
767/// # Example
768/// ```text
769/// GET /
770/// ```
771pub async fn stats_page_handler(State(state): State<AppState>) -> Html<String> {
772    let current_time = Utc::now();
773
774    // Log dashboard access
775    crate::log_and_capture!(info, "📊 Dashboard accessed - generating live statistics");
776
777    // Gather statistics using the storage service (no REST API calls)
778    let (pending, processing, completed, failed) = state
779        .storage_service
780        .get_crawl_queue_stats()
781        .await
782        .unwrap_or((0, 0, 0, 0));
783
784    // Log the statistics retrieval
785    crate::log_and_capture!(
786        info,
787        "📈 Queue stats retrieved: {} pending, {} processing, {} completed, {} failed",
788        pending,
789        processing,
790        completed,
791        failed
792    );
793
794    let recent_searches = state
795        .storage_service
796        .get_recent_search_statistics(10)
797        .await
798        .unwrap_or_default();
799
800    // Get recent application logs for display
801    let recent_logs = state.storage_service.get_recent_logs(20);
802
803    // Generate the HTML page with embedded CSS for old-school look
804    let html = format!(
805        r#"
806<!DOCTYPE html>
807<html lang="en">
808<head>
809    <meta charset="UTF-8">
810    <meta name="viewport" content="width=device-width, initial-scale=1.0">
811    <title>Search Engine Backend - Live Statistics</title>
812    <style>
813        @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400;700&display=swap');
814        
815        body {{
816            background-color: #0a0a0a;
817            color: #00ff00;
818            font-family: 'Courier Prime', 'Courier New', monospace;
819            margin: 0;
820            padding: 20px;
821            line-height: 1.4;
822            min-height: 100vh;
823            background-image: 
824                radial-gradient(circle at 20% 50%, rgba(0, 255, 0, 0.1) 0%, transparent 50%),
825                radial-gradient(circle at 80% 20%, rgba(0, 255, 255, 0.1) 0%, transparent 50%),
826                radial-gradient(circle at 40% 80%, rgba(255, 255, 0, 0.1) 0%, transparent 50%);
827        }}
828        
829        .container {{
830            max-width: 1200px;
831            margin: 0 auto;
832            border: 2px solid #00ff00;
833            padding: 20px;
834            border-radius: 10px;
835            box-shadow: 0 0 20px #00ff00;
836        }}
837        
838        h1 {{
839            text-align: center;
840            color: #00ffff;
841            text-shadow: 0 0 10px #00ffff;
842            font-size: 2.5rem;
843            margin-bottom: 10px;
844            letter-spacing: 3px;
845        }}
846        
847        .subtitle {{
848            text-align: center;
849            color: #ffff00;
850            margin-bottom: 30px;
851            font-size: 1.2rem;
852        }}
853        
854        .grid {{
855            display: grid;
856            grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
857            gap: 20px;
858            margin-bottom: 30px;
859        }}
860        
861        .panel {{
862            border: 1px solid #00ff00;
863            padding: 15px;
864            border-radius: 5px;
865            background-color: rgba(0, 255, 0, 0.05);
866        }}
867        
868        .panel h2 {{
869            color: #00ffff;
870            margin-top: 0;
871            font-size: 1.3rem;
872            text-shadow: 0 0 5px #00ffff;
873        }}
874        
875        .stat {{
876            display: flex;
877            justify-content: space-between;
878            margin: 8px 0;
879            padding: 5px;
880            border-bottom: 1px dotted #004400;
881        }}
882        
883        .stat-label {{
884            color: #cccccc;
885        }}
886        
887        .stat-value {{
888            color: #00ff00;
889            font-weight: bold;
890        }}
891        
892        .status-good {{ color: #00ff00; }}
893        .status-warning {{ color: #ffff00; }}
894        .status-error {{ color: #ff0000; }}
895        
896        .timestamp {{
897            text-align: center;
898            color: #888888;
899            margin-top: 20px;
900            font-size: 0.9rem;
901        }}
902        
903        .blink {{
904            animation: blink 1s linear infinite;
905        }}
906        
907        @keyframes blink {{
908            0%, 50% {{ opacity: 1; }}
909            51%, 100% {{ opacity: 0; }}
910        }}
911        
912        .activity-log {{
913            max-height: 300px;
914            overflow-y: auto;
915            background-color: rgba(0, 0, 0, 0.3);
916            padding: 10px;
917            border-radius: 5px;
918            font-size: 0.9rem;
919        }}
920        
921        .activity-item {{
922            margin: 5px 0;
923            padding: 3px 0;
924            border-bottom: 1px dotted #333333;
925        }}
926        
927        .refresh-notice {{
928            text-align: center;
929            color: #ffff00;
930            margin-top: 15px;
931            font-style: italic;
932        }}
933    </style>
934    <script>
935        // Auto-refresh every 30 seconds
936        setTimeout(function() {{
937            window.location.reload();
938        }}, 30000);
939    </script>
940</head>
941<body>
942    <div class="container">
943        <h1>🚀 SEARCH ENGINE BACKEND</h1>
944        <div class="subtitle">Live System Statistics Dashboard</div>
945        
946        <div class="grid">
947            <div class="panel">
948                <h2>📊 Crawl Queue Status</h2>
949                <div class="stat">
950                    <span class="stat-label">âŗ Pending Items:</span>
951                    <span class="stat-value">{}</span>
952                </div>
953                <div class="stat">
954                    <span class="stat-label">⚡ Processing:</span>
955                    <span class="stat-value {}">{}</span>
956                </div>
957                <div class="stat">
958                    <span class="stat-label">✅ Completed:</span>
959                    <span class="stat-value status-good">{}</span>
960                </div>
961                <div class="stat">
962                    <span class="stat-label">❌ Failed:</span>
963                    <span class="stat-value {}">{}</span>
964                </div>
965                <div class="stat">
966                    <span class="stat-label">📈 Total Processed:</span>
967                    <span class="stat-value">{}</span>
968                </div>
969            </div>
970            
971            <div class="panel">
972                <h2>🔍 Search Activity</h2>
973                <div class="stat">
974                    <span class="stat-label">Recent Searches:</span>
975                    <span class="stat-value">{}</span>
976                </div>
977                <div class="activity-log">
978                    {}
979                </div>
980            </div>
981            
982            <div class="panel">
983                <h2>📋 System Activity Log</h2>
984                <div class="activity-log">
985                    {}
986                </div>
987            </div>
988            
989            <div class="panel">
990                <h2>đŸŽ¯ System Status</h2>
991                <div class="stat">
992                    <span class="stat-label">đŸŸĸ Backend Status:</span>
993                    <span class="stat-value status-good">OPERATIONAL <span class="blink">●</span></span>
994                </div>
995                <div class="stat">
996                    <span class="stat-label">🔄 Queue Processing:</span>
997                    <span class="stat-value {}">{}</span>
998                </div>
999                <div class="stat">
1000                    <span class="stat-label">📡 Search Engine:</span>
1001                    <span class="stat-value status-good">ONLINE</span>
1002                </div>
1003                <div class="stat">
1004                    <span class="stat-label">💾 Storage:</span>
1005                    <span class="stat-value status-good">CONNECTED</span>
1006                </div>
1007            </div>
1008            
1009            <div class="panel">
1010                <h2>📈 Performance Metrics</h2>
1011                <div class="stat">
1012                    <span class="stat-label">⚡ Avg Search Time:</span>
1013                    <span class="stat-value">{} ms</span>
1014                </div>
1015                <div class="stat">
1016                    <span class="stat-label">đŸŽ¯ Success Rate:</span>
1017                    <span class="stat-value">{}%</span>
1018                </div>
1019                <div class="stat">
1020                    <span class="stat-label">đŸ”Ĩ Queue Efficiency:</span>
1021                    <span class="stat-value">{}%</span>
1022                </div>
1023            </div>
1024        </div>
1025        
1026        <div class="refresh-notice">
1027            🔄 Auto-refreshing every 30 seconds | Last updated: {}
1028        </div>
1029        
1030        <div class="timestamp">
1031            System Time: {} UTC<br>
1032            Powered by Rust đŸĻ€ | Azure Cognitive Search | CosmosDB
1033        </div>
1034    </div>
1035</body>
1036</html>
1037"#,
1038        // Crawl queue values
1039        pending,
1040        if processing > 0 {
1041            "status-warning blink"
1042        } else {
1043            "status-good"
1044        },
1045        processing,
1046        completed,
1047        if failed > 10 {
1048            "status-error"
1049        } else if failed > 0 {
1050            "status-warning"
1051        } else {
1052            "status-good"
1053        },
1054        failed,
1055        completed + failed,
1056        // Search activity
1057        recent_searches.len(),
1058        // Recent searches log
1059        if recent_searches.is_empty() {
1060            "<div class=\"activity-item\">No recent search activity</div>".to_string()
1061        } else {
1062            recent_searches
1063                .iter()
1064                .take(8)
1065                .map(|search| {
1066                    format!(
1067                        "<div class=\"activity-item\">{} - \"{}\" ({} results, {}ms)</div>",
1068                        search.timestamp.format("%H:%M:%S"),
1069                        search.query,
1070                        search.result_count,
1071                        search.search_time_ms
1072                    )
1073                })
1074                .collect::<Vec<_>>()
1075                .join("\n")
1076        },
1077        // System activity log
1078        generate_system_activity_log(
1079            &current_time,
1080            pending,
1081            processing,
1082            completed,
1083            failed,
1084            &recent_searches,
1085            &recent_logs
1086        ),
1087        // Queue processing status
1088        if processing > 0 {
1089            "status-warning blink"
1090        } else {
1091            "status-good"
1092        },
1093        if processing > 0 { "ACTIVE" } else { "IDLE" },
1094        // Performance metrics
1095        if !recent_searches.is_empty() {
1096            recent_searches
1097                .iter()
1098                .map(|s| s.search_time_ms)
1099                .sum::<u64>()
1100                / recent_searches.len() as u64
1101        } else {
1102            0
1103        },
1104        // Success rate calculation
1105        if completed + failed > 0 {
1106            (completed * 100) / (completed + failed)
1107        } else {
1108            100
1109        },
1110        // Queue efficiency
1111        if completed + failed + pending > 0 {
1112            (completed * 100) / (completed + failed + pending)
1113        } else {
1114            100
1115        },
1116        // Timestamps
1117        current_time.format("%H:%M:%S"),
1118        current_time.format("%Y-%m-%d %H:%M:%S")
1119    );
1120
1121    Html(html)
1122}
1123
1124/// Creates the main application router with all endpoints configured.
1125///
1126/// Sets up all HTTP routes and their corresponding handlers with the provided
1127/// application state. The router includes:
1128///
1129/// - `GET /` - Statistics dashboard page (live status overview)
1130/// - `GET /health` - Health check endpoint
1131/// - `GET /search` - Search endpoint with query parameters
1132/// - `POST /index` - Domain indexing endpoint
1133/// - `POST /admin/force-index` - Force immediate indexing (requires admin auth)
1134/// - `GET /admin/stats` - Search statistics endpoint (requires admin auth)
1135/// - `GET /admin/top-queries` - Top queries analytics endpoint (requires admin auth)
1136///
1137/// # Arguments
1138/// * `state` - Application state containing all service instances
1139///
1140/// # Returns
1141/// Configured Axum router ready to be served
1142///
1143/// # Example
1144/// ```rust,no_run
1145/// use search_engine_backend::{AppState, create_router, Config, StorageService, SearchService, IndexerService};
1146/// use std::sync::Arc;
1147///
1148/// # #[tokio::main]
1149/// # async fn main() -> Result<(), Box<dyn std::error::Error>> {
1150/// let config = Arc::new(Config::from_env()?);
1151/// let storage_service = Arc::new(StorageService::new(config.clone()).await?);
1152/// let search_service = Arc::new(SearchService::new(config.clone()).await?);
1153/// let indexer_service = Arc::new(IndexerService::new(
1154///     config.clone(),
1155///     storage_service.clone(),
1156///     search_service.clone()
1157/// ).await?);
1158///
1159/// let app_state = AppState {
1160///     config,
1161///     search_service,
1162///     storage_service,
1163///     indexer_service,
1164/// };
1165/// let router = create_router(app_state);
1166/// # Ok(())
1167/// # }
1168/// ```
1169pub fn create_router(state: AppState) -> Router {
1170    Router::new()
1171        .route("/", get(stats_page_handler))
1172        .route("/health", get(health_handler))
1173        .route("/search", get(search_handler))
1174        .route("/index", post(index_handler))
1175        .route("/admin/force-index", post(admin_force_index_handler))
1176        .route(
1177            "/admin/force-process-queue",
1178            post(admin_force_process_queue_handler),
1179        )
1180        .route("/admin/stats", get(admin_stats_handler))
1181        .route("/admin/top-queries", get(admin_top_queries_handler))
1182        .with_state(state)
1183}
1184
#[cfg(test)]
mod tests {
    use super::*;

    /// `TopQuery` keeps the query text and frequency it was constructed with.
    #[test]
    fn test_top_query_structure() {
        let top_query = TopQuery {
            query: "rust programming".to_string(),
            frequency: 42,
        };

        assert_eq!(top_query.query, "rust programming");
        assert_eq!(top_query.frequency, 42);
    }

    /// An empty `StatsResponse` reports a zero count and no recent searches.
    #[test]
    fn test_stats_response_structure() {
        let stats_response = StatsResponse {
            recent_searches: vec![],
            total_count: 0,
        };

        assert_eq!(stats_response.total_count, 0);
        assert!(stats_response.recent_searches.is_empty());
    }

    /// `ForceProcessQueueResponse` exposes its message and trigger flag unchanged.
    #[test]
    fn test_force_process_queue_response_structure() {
        let response = ForceProcessQueueResponse {
            message: "Force queue processing triggered successfully".to_string(),
            triggered: true,
        };

        assert_eq!(
            response.message,
            "Force queue processing triggered successfully"
        );
        assert!(response.triggered);
    }

    /// Checks the skeleton of the statistics page markup.
    ///
    /// The real handler needs the full service stack, so this test inspects a
    /// reduced local copy of the template rather than actual handler output.
    #[test]
    fn test_stats_page_handler_returns_html() {
        let html_template = r#"<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Search Engine Backend - Live Statistics</title>
</head>
<body>
    <div class="container">
        <h1>🚀 SEARCH ENGINE BACKEND</h1>
    </div>
</body>
</html>"#;

        // Verify basic HTML structure
        assert!(html_template.contains("<!DOCTYPE html>"));
        assert!(html_template.contains("Search Engine Backend"));
        assert!(html_template.contains("Live Statistics"));
        assert!(html_template.contains("🚀 SEARCH ENGINE BACKEND"));
    }

    /// Sanity-checks the labels the statistics dashboard is expected to show.
    ///
    /// NOTE(review): this test does not render the page, so it cannot detect
    /// drift between these labels and the dashboard template; it only
    /// validates the label list itself.
    #[test]
    fn test_statistics_page_content_structure() {
        // A fixed list that is only iterated: an array avoids the heap
        // allocation of `vec!`.
        let expected_elements = [
            "📊 Crawl Queue Status",
            "🔍 Search Activity",
            "🎯 System Status",
            "📈 Performance Metrics",
            "⏳ Pending Items:",
            "⚡ Processing:",
            "✅ Completed:",
            "❌ Failed:",
            "🟢 Backend Status:",
            "Auto-refreshing every 30 seconds",
        ];

        // Every label must contain one of these markers (a section emoji, or
        // the "Auto" prefix of the refresh notice).
        let known_markers = ["📊", "🔍", "🎯", "📈", "⏳", "⚡", "✅", "❌", "🟢", "Auto"];

        for element in expected_elements {
            assert!(!element.is_empty());
            assert!(
                known_markers.iter().any(|&marker| element.contains(marker)),
                "dashboard label without a known marker: {}",
                element
            );
        }
    }
}