Source code for toolregistry.hub.websearch.websearch_searxng

from concurrent.futures import ProcessPoolExecutor
from typing import Dict, List, Optional

import httpx
from loguru import logger

from .websearch import HEADERS_DEFAULT, TIMEOUT_DEFAULT, WebSearchGeneral


class _WebSearchEntrySearxNG(dict):
    """Search result entry model with type validation."""

    def __init__(self, **data):
        super().__init__(**data)

    content: str
    thumbnail: Optional[str] = None
    engine: str
    template: str
    parsed_url: List[str]
    img_src: Optional[str] = None
    priority: Optional[str] = None
    engines: List[str]
    positions: List[int]
    score: float
    category: str


[docs] class WebSearchSearxng(WebSearchGeneral): """WebSearchSearxng provides a unified interface for performing web searches and processing results through a SearxNG instance. It handles search queries, result filtering, and content extraction. Features: - Performs web searches using SearxNG instance - Filters results by relevance score threshold - Extracts and cleans webpage content using multiple methods (BeautifulSoup/Jina Reader) - Parallel processing of result fetching - Automatic emoji removal and text normalization Examples: >>> from toolregistry.hub.websearch_searxng import WebSearchSearxng >>> searcher = WebSearchSearxng("http://localhost:8080") >>> results = searcher.search("python web scraping", number_results=3) >>> for result in results: ... print(result["title"]) """
[docs] def __init__( self, searxng_base_url: str, proxy: Optional[str] = None, ): """Initialize WebSearchSearxng with configuration parameters. Args: searxng_base_url (str): Base URL for the SearxNG instance (e.g. "http://localhost:8080"). proxy (Optional[str]): Proxy URL for HTTP requests. """ self.searxng_base_url: str = searxng_base_url.rstrip("/") if not self.searxng_base_url.endswith("/search"): self.searxng_base_url += "/search" # Ensure the URL ends with /search self.proxy: Optional[str] = proxy if proxy else None
[docs] def search( self, query: str, number_results: int = 5, threshold: float = 0.2, timeout: Optional[float] = None, ) -> List[Dict[str, str]]: """Perform search and return results. Args: query (str): The search query. Boolean operators like AND, OR, NOT can be used if needed. number_results (int, optional): The maximum number of results to return. Defaults to 5. threshold (float, optional): Minimum score threshold for results [0-1.0]. Defaults to 0.2. timeout (float, optional): Request timeout in seconds. Defaults to TIMEOUT_DEFAULT (10). Usually not needed. Returns: List[Dict[str, str]]: A list of enriched search results. Each dictionary contains: - 'title': The title of the search result. - 'url': The URL of the search result. - 'content': The content of the search result. - 'excerpt': The excerpt of the search result. """ try: results = self._meta_search_searxng( query, num_results=number_results, proxy=self.proxy, timeout=timeout, searxng_base_url=self.searxng_base_url, ) filtered_results = [ entry for entry in results if entry.get("score", 0) >= threshold ] with ProcessPoolExecutor() as executor: enriched_results = list( executor.map( self._fetch_webpage_content, filtered_results, ) ) return enriched_results except httpx.RequestError as e: logger.debug(f"Request error: {e}") return [] except httpx.HTTPStatusError as e: logger.debug(f"HTTP error: {e.response.status_code}") return []
@staticmethod def _meta_search_searxng( query, num_results=10, proxy: Optional[str] = None, timeout: Optional[float] = 5, searxng_base_url: str = "http://localhost:8080/search", ) -> List[_WebSearchEntrySearxNG]: """ Perform a search using SearXNG and return the results. """ response = httpx.get( searxng_base_url, params={ "q": query, "format": "json", }, headers=HEADERS_DEFAULT, proxy=proxy, timeout=timeout or TIMEOUT_DEFAULT, ) response.raise_for_status() results = response.json().get("results", []) results = sorted(results, key=lambda x: x.get("score", 0), reverse=True) if len(results) > num_results: # Fetch additional results if needed results = results[:num_results] return results
if __name__ == "__main__": import json import os SEARXNG_URL = os.getenv("SEARXNG_URL", "http://localhost:8080") search_tool = WebSearchSearxng(SEARXNG_URL) results = search_tool.search("Barcelona weather today", 5) for result in results: print(json.dumps(result, indent=2, ensure_ascii=False)) print(search_tool.extract(results[0]["url"]))