pub struct StorageService {
client: Client,
cosmos_client: Option<CosmosClient>,
config: Arc<Config>,
}
Expand description
Storage service for Azure CosmosDB operations.
Provides methods for storing and retrieving web pages, managing crawl queues, and tracking search statistics. The service is currently being migrated from the CosmosDB REST API to the official Azure SDK.
Fields§
§client: Client
HTTP client for CosmosDB requests (legacy, to be removed)
cosmos_client: Option<CosmosClient>
Azure Cosmos DB client (official SDK)
config: Arc<Config>
Application configuration
Implementations§
Source§impl StorageService
impl StorageService
Sourcepub async fn new(config: Arc<Config>) -> Result<Self>
pub async fn new(config: Arc<Config>) -> Result<Self>
Creates a new StorageService instance.
Initializes the HTTP client and Cosmos SDK client, then ensures that the required CosmosDB database and containers exist.
§Arguments
- `config`: Application configuration containing CosmosDB connection details
§Returns
A new `StorageService` instance ready for use.
§Errors
Returns an error if:
- HTTP client creation fails
- Cosmos SDK client creation fails
- Database or container initialization fails
- CosmosDB connection cannot be established
Sourcefn create_cosmos_client(config: &Config) -> Result<CosmosClient>
fn create_cosmos_client(config: &Config) -> Result<CosmosClient>
Create Azure Cosmos DB SDK client with master key authentication
pub async fn store_webpage(&self, webpage: &WebPage) -> Result<()>
async fn store_webpage_rest_api(&self, webpage: &WebPage) -> Result<()>
pub async fn get_webpage( &self, id: &str, domain: &str, ) -> Result<Option<WebPage>>
async fn get_webpage_rest_api( &self, id: &str, domain: &str, ) -> Result<Option<WebPage>>
pub async fn queue_crawl(&self, crawl_item: &CrawlQueue) -> Result<()>
async fn queue_crawl_rest_api(&self, crawl_item: &CrawlQueue) -> Result<()>
pub async fn get_pending_crawl_items( &self, limit: usize, ) -> Result<Vec<CrawlQueue>>
async fn get_pending_crawl_items_sdk_query( &self, limit: usize, ) -> Result<Vec<CrawlQueue>>
async fn create_root_domain_crawl_items( &self, limit: usize, ) -> Result<Vec<CrawlQueue>>
fn url_to_id(url: &str) -> String
async fn get_pending_crawl_items_rest_api( &self, limit: usize, ) -> Result<Vec<CrawlQueue>>
pub async fn update_crawl_status( &self, id: &str, domain: &str, status: CrawlStatus, error_message: Option<String>, ) -> Result<()>
async fn update_crawl_status_rest_api( &self, id: &str, domain: &str, status: CrawlStatus, error_message: Option<String>, ) -> Result<()>
Sourcepub async fn get_domain_last_indexed(
&self,
domain: &str,
) -> Result<Option<DateTime<Utc>>>
pub async fn get_domain_last_indexed( &self, domain: &str, ) -> Result<Option<DateTime<Utc>>>
Get the last indexed time for a specific domain. Returns the most recent `last_crawled` timestamp for any page in the domain.
Sourcepub async fn store_search_statistic(
&self,
statistic: &SearchStatistic,
) -> Result<()>
pub async fn store_search_statistic( &self, statistic: &SearchStatistic, ) -> Result<()>
Store search statistics for administrative analytics
Sourcepub async fn get_recent_search_statistics(
&self,
limit: usize,
) -> Result<Vec<SearchStatistic>>
pub async fn get_recent_search_statistics( &self, limit: usize, ) -> Result<Vec<SearchStatistic>>
Get recent search statistics for administrative purposes
Sourcepub async fn get_top_search_queries(
&self,
limit: usize,
) -> Result<Vec<(String, usize)>>
pub async fn get_top_search_queries( &self, limit: usize, ) -> Result<Vec<(String, usize)>>
Get top search queries by frequency
pub async fn get_crawl_item( &self, id: &str, domain: &str, ) -> Result<Option<CrawlQueue>>
async fn get_crawl_item_rest_api( &self, id: &str, domain: &str, ) -> Result<Option<CrawlQueue>>
async fn ensure_database_exists(&self) -> Result<()>
async fn ensure_database_exists_rest_api(&self) -> Result<()>
async fn ensure_containers_exist(&self) -> Result<()>
async fn create_container( &self, container_name: &str, partition_key: &str, ) -> Result<()>
async fn create_container_rest_api( &self, container_name: &str, partition_key: &str, ) -> Result<()>
Sourcefn get_rfc1123_date() -> String
fn get_rfc1123_date() -> String
Generate RFC 1123 formatted date string
Sourcefn generate_cosmos_signature(
verb: &str,
resource_type: &str,
resource_id: &str,
date: &str,
master_key: &str,
) -> Result<String>
fn generate_cosmos_signature( verb: &str, resource_type: &str, resource_id: &str, date: &str, master_key: &str, ) -> Result<String>
Generate Azure Cosmos DB authorization signature
Sourcefn cosmos_auth_headers(
&self,
verb: &str,
resource_type: &str,
resource_id: &str,
) -> Result<(String, String)>
fn cosmos_auth_headers( &self, verb: &str, resource_type: &str, resource_id: &str, ) -> Result<(String, String)>
Generate Azure Cosmos DB authorization header and date
Sourcepub async fn get_crawl_queue_stats(
&self,
) -> Result<(usize, usize, usize, usize)>
pub async fn get_crawl_queue_stats( &self, ) -> Result<(usize, usize, usize, usize)>
Get crawl queue status statistics for monitoring and logging
Returns counts of crawl items by status.
§Returns
A tuple containing `(pending_count, processing_count, completed_count, failed_count)`.
async fn get_crawl_queue_stats_sdk( &self, ) -> Result<(usize, usize, usize, usize)>
Sourcepub fn get_recent_logs(&self, limit: usize) -> Vec<LogEntry>
pub fn get_recent_logs(&self, limit: usize) -> Vec<LogEntry>
Get recent application logs for display in the dashboard.
Returns recent log entries captured by the application.
async fn get_domain_partition_stats( &self, domain: &str, sample_limit: usize, ) -> Result<(usize, usize, usize, usize)>
Sourcepub async fn remove_duplicates(&self) -> Result<usize>
pub async fn remove_duplicates(&self) -> Result<usize>
Remove duplicate entries from the crawl queue and web pages collections
This method identifies and removes duplicates based on:
- Multiple crawl queue entries with the same URL
- Multiple web page entries with the same URL
Sourceasync fn remove_crawl_queue_duplicates(&self) -> Result<usize>
async fn remove_crawl_queue_duplicates(&self) -> Result<usize>
Remove duplicate crawl queue entries
Sourceasync fn remove_webpage_duplicates(&self) -> Result<usize>
async fn remove_webpage_duplicates(&self) -> Result<usize>
Remove duplicate web page entries
Sourceasync fn query_crawl_queue_duplicates(&self, query: &str) -> Result<Vec<String>>
async fn query_crawl_queue_duplicates(&self, query: &str) -> Result<Vec<String>>
Query for URLs that have duplicates in the crawl queue
Sourceasync fn query_webpage_duplicates(&self, query: &str) -> Result<Vec<String>>
async fn query_webpage_duplicates(&self, query: &str) -> Result<Vec<String>>
Query for URLs that have duplicates in the web pages collection
Sourceasync fn get_crawl_queue_entries_by_url(
&self,
url: &str,
) -> Result<Vec<CrawlQueue>>
async fn get_crawl_queue_entries_by_url( &self, url: &str, ) -> Result<Vec<CrawlQueue>>
Get all crawl queue entries for a specific URL
Sourceasync fn get_webpage_entries_by_url(&self, url: &str) -> Result<Vec<WebPage>>
async fn get_webpage_entries_by_url(&self, url: &str) -> Result<Vec<WebPage>>
Get all webpage entries for a specific URL