Struct StorageService

Source
pub struct StorageService {
    client: Client,
    cosmos_client: Option<CosmosClient>,
    config: Arc<Config>,
}
Expand description

Storage service for Azure CosmosDB operations.

Provides methods for storing and retrieving web pages, managing crawl queues, and tracking search statistics. The implementation is currently being migrated from the REST API to the official Azure SDK.

Fields§

§client: Client

HTTP client for CosmosDB requests (legacy, to be removed)

§cosmos_client: Option<CosmosClient>

Azure Cosmos DB client (official SDK)

§config: Arc<Config>

Application configuration

Implementations§

Source§

impl StorageService

Source

pub async fn new(config: Arc<Config>) -> Result<Self>

Creates a new StorageService instance.

Initializes the HTTP client and Cosmos SDK client, then ensures that the required CosmosDB database and containers exist.

§Arguments
  • config - Application configuration containing CosmosDB connection details
§Returns

A new StorageService instance ready for use.

§Errors

Returns an error if:

  • HTTP client creation fails
  • Cosmos SDK client creation fails
  • Database or container initialization fails
  • CosmosDB connection cannot be established
Source

fn create_cosmos_client(config: &Config) -> Result<CosmosClient>

Create Azure Cosmos DB SDK client with master key authentication

Source

pub async fn store_webpage(&self, webpage: &WebPage) -> Result<()>

Source

async fn store_webpage_rest_api(&self, webpage: &WebPage) -> Result<()>

Source

pub async fn get_webpage( &self, id: &str, domain: &str, ) -> Result<Option<WebPage>>

Source

async fn get_webpage_rest_api( &self, id: &str, domain: &str, ) -> Result<Option<WebPage>>

Source

pub async fn queue_crawl(&self, crawl_item: &CrawlQueue) -> Result<()>

Source

async fn queue_crawl_rest_api(&self, crawl_item: &CrawlQueue) -> Result<()>

Source

pub async fn get_pending_crawl_items( &self, limit: usize, ) -> Result<Vec<CrawlQueue>>

Source

async fn get_pending_crawl_items_sdk_query( &self, limit: usize, ) -> Result<Vec<CrawlQueue>>

Source

async fn create_root_domain_crawl_items( &self, limit: usize, ) -> Result<Vec<CrawlQueue>>

Source

fn url_to_id(url: &str) -> String

Source

async fn get_pending_crawl_items_rest_api( &self, limit: usize, ) -> Result<Vec<CrawlQueue>>

Source

pub async fn update_crawl_status( &self, id: &str, domain: &str, status: CrawlStatus, error_message: Option<String>, ) -> Result<()>

Source

async fn update_crawl_status_rest_api( &self, id: &str, domain: &str, status: CrawlStatus, error_message: Option<String>, ) -> Result<()>

Source

pub async fn get_domain_last_indexed( &self, domain: &str, ) -> Result<Option<DateTime<Utc>>>

Get the last indexed time for a specific domain. Returns the most recent last_crawled timestamp for any page in the domain.

Source

pub async fn store_search_statistic( &self, statistic: &SearchStatistic, ) -> Result<()>

Store search statistics for administrative analytics

Source

pub async fn get_recent_search_statistics( &self, limit: usize, ) -> Result<Vec<SearchStatistic>>

Get recent search statistics for administrative purposes

Source

pub async fn get_top_search_queries( &self, limit: usize, ) -> Result<Vec<(String, usize)>>

Get top search queries by frequency

Source

pub async fn get_crawl_item( &self, id: &str, domain: &str, ) -> Result<Option<CrawlQueue>>

Source

async fn get_crawl_item_rest_api( &self, id: &str, domain: &str, ) -> Result<Option<CrawlQueue>>

Source

async fn ensure_database_exists(&self) -> Result<()>

Source

async fn ensure_database_exists_rest_api(&self) -> Result<()>

Source

async fn ensure_containers_exist(&self) -> Result<()>

Source

async fn create_container( &self, container_name: &str, partition_key: &str, ) -> Result<()>

Source

async fn create_container_rest_api( &self, container_name: &str, partition_key: &str, ) -> Result<()>

Source

fn get_rfc1123_date() -> String

Generate RFC 1123 formatted date string

Source

fn generate_cosmos_signature( verb: &str, resource_type: &str, resource_id: &str, date: &str, master_key: &str, ) -> Result<String>

Generate Azure Cosmos DB authorization signature

Source

fn cosmos_auth_headers( &self, verb: &str, resource_type: &str, resource_id: &str, ) -> Result<(String, String)>

Generate Azure Cosmos DB authorization header and date

Source

pub async fn get_crawl_queue_stats( &self, ) -> Result<(usize, usize, usize, usize)>

Get crawl queue status statistics for monitoring and logging

Returns counts of crawl items by status.

§Returns

A tuple containing (pending_count, processing_count, completed_count, failed_count)

Source

async fn get_crawl_queue_stats_sdk( &self, ) -> Result<(usize, usize, usize, usize)>

Source

pub fn get_recent_logs(&self, limit: usize) -> Vec<LogEntry>

Get recent application logs for display in the dashboard

Returns recent log entries captured by the application

Source

async fn get_domain_partition_stats( &self, domain: &str, sample_limit: usize, ) -> Result<(usize, usize, usize, usize)>

Source

pub async fn remove_duplicates(&self) -> Result<usize>

Remove duplicate entries from the crawl queue and web pages collections

This method identifies and removes duplicates based on:

  1. Multiple crawl queue entries with the same URL
  2. Multiple web page entries with the same URL
Source

async fn remove_crawl_queue_duplicates(&self) -> Result<usize>

Remove duplicate crawl queue entries

Source

async fn remove_webpage_duplicates(&self) -> Result<usize>

Remove duplicate web page entries

Source

async fn query_crawl_queue_duplicates(&self, query: &str) -> Result<Vec<String>>

Query for URLs that have duplicates in the crawl queue

Source

async fn query_webpage_duplicates(&self, query: &str) -> Result<Vec<String>>

Query for URLs that have duplicates in the web pages collection

Source

async fn get_crawl_queue_entries_by_url( &self, url: &str, ) -> Result<Vec<CrawlQueue>>

Get all crawl queue entries for a specific URL

Source

async fn get_webpage_entries_by_url(&self, url: &str) -> Result<Vec<WebPage>>

Get all webpage entries for a specific URL

Source

async fn delete_crawl_queue_entry(&self, id: &str, domain: &str) -> Result<()>

Delete a crawl queue entry by ID and domain

Source

async fn delete_webpage_entry(&self, id: &str, domain: &str) -> Result<()>

Delete a webpage entry by ID and domain

Auto Trait Implementations§

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

§

impl<T> Instrument for T

§

fn instrument(self, span: Span) -> Instrumented<Self>

Instruments this type with the provided [Span], returning an Instrumented wrapper. Read more
§

fn in_current_span(self) -> Instrumented<Self>

Instruments this type with the current Span, returning an Instrumented wrapper. Read more
Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

§

impl<T> PolicyExt for T
where T: ?Sized,

§

fn and<P, B, E>(self, other: P) -> And<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

Create a new Policy that returns [Action::Follow] only if self and other return Action::Follow. Read more
§

fn or<P, B, E>(self, other: P) -> Or<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

Create a new Policy that returns [Action::Follow] if either self or other returns Action::Follow. Read more
Source§

impl<T> Same for T

Source§

type Output = T

Should always be Self
Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

§

fn vzip(self) -> V

§

impl<T> WithSubscriber for T

§

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

Attaches the provided Subscriber to this type, returning a [WithDispatch] wrapper. Read more
§

fn with_current_subscriber(self) -> WithDispatch<Self>

Attaches the current default Subscriber to this type, returning a [WithDispatch] wrapper. Read more
§

impl<T> ErasedDestructor for T
where T: 'static,

§

impl<A, B, T> HttpServerConnExec<A, B> for T
where B: Body,