Source code for toolregistry.hub.websearch.websearch

import re
import sys
import unicodedata
from abc import ABC, abstractmethod
from typing import Literal, Optional

import httpx
from bs4 import BeautifulSoup
from loguru import logger

_UNABLE_TO_FETCH_CONTENT = "Unable to fetch content"
_UNABLE_TO_FETCH_TITLE = "Unable to fetch title"

# Use fake_useragent only on Python >= 3.9 (its newer releases target modern
# interpreters); otherwise fall back to a static desktop User-Agent string.
if sys.version_info < (3, 9):
    HEADERS_DEFAULT = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"
    }
else:
    from fake_useragent import UserAgent

    HEADERS_DEFAULT = {"User-Agent": UserAgent(platforms="mobile").random}
TIMEOUT_DEFAULT = 10.0


class _WebSearchEntryGeneral(dict):
    """Dict-backed container for a single raw search result."""

    url: str
    title: str
    content: str

    def __init__(self, **data):
        super().__init__(**data)
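
# Illustration (not in the original module): an entry is just a dict with
# conventional keys, constructed like
#     entry = _WebSearchEntryGeneral(
#         url="https://example.com", title="Example", content="snippet"
#     )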


class WebSearchGeneral(ABC):
    """Abstract base for web-search backends: declares the search interface
    and provides shared content-extraction helpers."""

    @abstractmethod
    def search(
        self,
        query: str,
        number_results: int = 5,
        threshold: float = 0.2,
        timeout: Optional[float] = None,
    ) -> list:
        """Perform search and return results.

        Args:
            query (str): The search query.
            number_results (int, optional): The maximum number of results to return. Defaults to 5.
            threshold (float, optional): Minimum score threshold for results [0-1.0]. Defaults to 0.2.
            timeout (float, optional): Request timeout in seconds. Defaults to None.

        Returns:
            list: A list of search results.
        """
        pass

    @staticmethod
    def extract(url: str, timeout: Optional[float] = None) -> str:
        """Extract content from a given URL using the available methods.

        Args:
            url (str): The URL to extract content from.
            timeout (float, optional): Request timeout in seconds. Defaults to TIMEOUT_DEFAULT (10). Usually not needed.

        Returns:
            str: Extracted content from the URL, or a placeholder message if every method fails.
        """
        # First try the BeautifulSoup method
        content = WebSearchGeneral._get_content_with_bs4(
            url, timeout=timeout or TIMEOUT_DEFAULT
        )
        if not content:
            # Fall back to Jina Reader if BeautifulSoup fails
            content = WebSearchGeneral._get_content_with_jina_reader(
                url, timeout=timeout or TIMEOUT_DEFAULT
            )

        formatted_content = (
            WebSearchGeneral._format_text(content)
            if content
            else _UNABLE_TO_FETCH_CONTENT
        )
        return formatted_content

    @staticmethod
    def _fetch_webpage_content(entry: _WebSearchEntryGeneral) -> dict:
        """Retrieve complete webpage content from a search result entry.

        Args:
            entry (_WebSearchEntryGeneral): The search result entry.

        Returns:
            dict: A dictionary containing the title, URL, content, and excerpt of the webpage.
        """
        url = entry["url"]
        if not url:
            raise ValueError("Result missing URL")

        try:
            content = WebSearchGeneral.extract(url)
        except Exception as e:
            content = _UNABLE_TO_FETCH_CONTENT
            logger.debug(f"Error retrieving webpage content: {e}")

        return {
            "title": entry.get("title", _UNABLE_TO_FETCH_TITLE),
            "url": url,
            "content": content,
            "excerpt": entry.get("content", _UNABLE_TO_FETCH_CONTENT),
        }

    @staticmethod
    def _remove_emojis(text: str) -> str:
        """Remove emoji expressions from text.

        Args:
            text (str): The input text.

        Returns:
            str: Text with emojis removed.
        """
        # Unicode category "So" (Symbol, other) covers emoji and pictographs.
        return "".join(
            c for c in text if not unicodedata.category(c).startswith("So")
        )

    @staticmethod
    def _format_text(text: str) -> str:
        """Format text content.

        Args:
            text (str): The input text.

        Returns:
            str: Formatted text.
        """
        text = unicodedata.normalize("NFKC", text)
        text = re.sub(r"[^\S\n]+", " ", text)  # collapse whitespace runs, keeping newlines
        text = re.sub(r"\n+", "\n", text)  # collapse consecutive newlines
        text = text.strip()
        text = WebSearchGeneral._remove_emojis(text)
        return text

    @staticmethod
    def _get_content_with_jina_reader(
        url: str,
        return_format: Literal["markdown", "text", "html"] = "text",
        timeout: Optional[float] = None,
    ) -> str:
        """Fetch parsed content from Jina AI Reader for a given URL.

        Args:
            url (str): The URL to fetch content from.
            return_format (Literal["markdown", "text", "html"], optional): The format of the returned content. Defaults to "text".
            timeout (Optional[float], optional): Timeout for the HTTP request. Defaults to TIMEOUT_DEFAULT.

        Returns:
            str: Parsed content from Jina AI, or an empty string on failure.
        """
        try:
            headers = {
                "X-Return-Format": return_format,
                "X-Remove-Selector": "header, .class, #id",
                "X-Target-Selector": "body, .class, #id",
            }
            jina_reader_url = "https://r.jina.ai/"
            response = httpx.get(
                jina_reader_url + url,
                headers=headers,
                timeout=timeout or TIMEOUT_DEFAULT,
            )
            response.raise_for_status()
            return response.text
        except httpx.HTTPStatusError as e:
            logger.debug(f"HTTP Error [{e.response.status_code}]: {e}")
            return ""
        except Exception as e:
            logger.debug(f"Other error: {e}")
            return ""

    @staticmethod
    def _get_content_with_bs4(
        url: str,
        timeout: Optional[float] = None,
    ) -> str:
        """Fetch a webpage with HEADERS_DEFAULT and parse its main content using BeautifulSoup.

        Args:
            url (str): The URL of the webpage.
            timeout (Optional[float]): Timeout for the HTTP request. Defaults to TIMEOUT_DEFAULT.

        Returns:
            str: Parsed text content of the webpage, or an empty string on failure.
        """
        try:
            response = httpx.get(
                url,
                headers=HEADERS_DEFAULT,
                timeout=timeout or TIMEOUT_DEFAULT,
            )
            response.raise_for_status()

            soup = BeautifulSoup(response.text, "html.parser")
            # Drop non-content elements before extracting text.
            for element in soup(
                ["script", "style", "nav", "footer", "iframe", "noscript"]
            ):
                element.decompose()

            # Prefer semantic containers; fall back to the whole <body>.
            main_content = (
                soup.find("main")
                or soup.find("article")
                or soup.find("div", {"class": "content"})
            )
            content_source = main_content if main_content else soup.body
            if not content_source:
                return ""
            return content_source.get_text(separator=" ", strip=True)
        except httpx.HTTPStatusError as e:
            logger.debug(f"HTTP Error [{e.response.status_code}]: {e}")
            return ""
        except Exception as e:
            logger.debug(f"Error parsing webpage content: {e}")
            return ""
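

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): a minimal
# concrete subclass satisfying the abstract contract with a single canned
# entry. The name _DemoSearch and its fake result are assumptions made for
# demonstration only; a real backend would query an actual search engine.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    class _DemoSearch(WebSearchGeneral):
        def search(
            self,
            query: str,
            number_results: int = 5,
            threshold: float = 0.2,
            timeout: Optional[float] = None,
        ) -> list:
            entries = [
                _WebSearchEntryGeneral(
                    url="https://example.com",
                    title=f"Canned result for {query!r}",
                    content="Canned excerpt.",
                )
            ]
            # Enrich each raw entry with the fetched page content.
            return [
                WebSearchGeneral._fetch_webpage_content(entry)
                for entry in entries[:number_results]
            ]

    results = _DemoSearch().search("httpx tutorial", number_results=1)
    print(results[0]["title"], "->", results[0]["url"])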