import re
import sys
import unicodedata
from abc import ABC, abstractmethod
from typing import Literal, Optional
import httpx
from bs4 import BeautifulSoup
from loguru import logger
# Placeholder strings substituted when a page body or title is unavailable.
_UNABLE_TO_FETCH_CONTENT = "Unable to fetch content"
_UNABLE_TO_FETCH_TITLE = "Unable to fetch title"

# Default request headers. On Python 3.9+ the User-Agent is whatever
# ``UserAgent(platforms="mobile").random`` yields at import time; on older
# interpreters a fixed desktop Edge/Chrome string is used instead.
if sys.version_info >= (3, 9):
    from fake_useragent import UserAgent

    HEADERS_DEFAULT = {"User-Agent": UserAgent(platforms="mobile").random}
else:
    HEADERS_DEFAULT = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"
    }

# Fallback HTTP timeout (seconds) used when a caller does not supply one.
TIMEOUT_DEFAULT = 10.0
class _WebSearchEntryGeneral(dict):
def __init__(self, **data):
super().__init__(**data)
url: str
title: str
content: str
# [docs]  (leftover Sphinx "view source" link marker; not part of the code)
class WebSearchGeneral(ABC):
# [docs]  (leftover Sphinx "view source" link marker; not part of the code)
@abstractmethod
def search(
self,
query: str,
number_results: int = 5,
threshold: float = 0.2,
timeout: Optional[float] = None,
) -> list:
"""Perform search and return results.
Args:
query (str): The search query.
number_results (int, optional): The maximum number of results to return. Defaults to 5.
threshold (float, optional): Minimum score threshold for results [0-1.0]. Defaults to 0.2.
timeout (float, optional): Request timeout in seconds. Defaults to None.
Returns:
list: A list of search results.
"""
pass
@staticmethod
def _fetch_webpage_content(entry: _WebSearchEntryGeneral) -> dict:
"""Retrieve complete webpage content from search result entry.
Args:
entry (_WebSearchEntryGeneral): The search result entry.
Returns:
Dict[str, str]: A dictionary containing the title, URL, content, and excerpt of the webpage.
"""
url = entry["url"]
if not url:
raise ValueError("Result missing URL")
try:
content = WebSearchGeneral.extract(url)
except Exception as e:
content = _UNABLE_TO_FETCH_CONTENT
logger.debug(f"Error retrieving webpage content: {e}")
return {
"title": entry.get("title", _UNABLE_TO_FETCH_TITLE),
"url": url,
"content": content,
"excerpt": entry.get("content", _UNABLE_TO_FETCH_CONTENT),
}
@staticmethod
def _remove_emojis(text: str) -> str:
"""Remove emoji expressions from text.
Args:
text (str): The input text.
Returns:
str: Text with emojis removed.
"""
return "".join(c for c in text if not unicodedata.category(c).startswith("So"))
@staticmethod
def _format_text(text: str) -> str:
"""Format text content.
Args:
text (str): The input text.
Returns:
str: Formatted text.
"""
text = unicodedata.normalize("NFKC", text)
text = re.sub(r"[^\S\n]+", " ", text)
text = re.sub(r"\n+", "\n", text)
text = text.strip()
text = WebSearchGeneral._remove_emojis(text)
return text
@staticmethod
def _get_content_with_jina_reader(
    url: str,
    return_format: Literal["markdown", "text", "html"] = "text",
    timeout: Optional[float] = None,
) -> str:
    """Retrieve a page's parsed content through the Jina AI reader endpoint.

    Issues a GET request to ``https://r.jina.ai/`` followed by ``url`` and
    returns the response body. Any failure is logged at debug level and
    reported to the caller as an empty string.

    Args:
        url (str): The URL to fetch content from.
        return_format (Literal["markdown", "text", "html"], optional):
            Value sent in the ``X-Return-Format`` header. Defaults to "text".
        timeout (Optional[float], optional): Timeout for the HTTP request;
            a falsy value selects TIMEOUT_DEFAULT.

    Returns:
        str: The response text, or "" if the request failed.
    """
    reader_endpoint = "https://r.jina.ai/"
    request_headers = {
        "X-Return-Format": return_format,
        "X-Remove-Selector": "header, .class, #id",
        "X-Target-Selector": "body, .class, #id",
    }
    try:
        reply = httpx.get(
            reader_endpoint + url,
            headers=request_headers,
            timeout=timeout or TIMEOUT_DEFAULT,
        )
        reply.raise_for_status()
        return reply.text
    except httpx.HTTPStatusError as status_err:
        # Non-success HTTP status reported by raise_for_status().
        logger.debug(
            f"HTTP Error [{status_err.response.status_code}]: {status_err}"
        )
        return ""
    except Exception as err:
        # Anything else (transport failure, bad URL type, ...).
        logger.debug(f"Other error: {err}")
        return ""
@staticmethod
def _get_content_with_bs4(
    url: str,
    timeout: Optional[float] = None,
) -> str:
    """Utilizes BeautifulSoup to fetch and parse the content of a webpage.

    The page is requested with HEADERS_DEFAULT and redirects are followed.
    ``script``, ``style``, ``nav``, ``footer``, ``iframe`` and ``noscript``
    elements are discarded, then text is taken from the first of
    ``<main>``, ``<article>`` or ``<div class="content">`` that exists,
    falling back to ``<body>``. Any failure is logged at debug level and
    reported to the caller as an empty string.

    Args:
        url (str): The URL of the webpage.
        timeout (Optional[float]): Timeout for the HTTP request; a falsy
            value selects TIMEOUT_DEFAULT.

    Returns:
        str: Parsed text content of the webpage, or "" if the request or
        parsing failed or the document has no usable content element.
    """
    try:
        response = httpx.get(
            url,
            headers=HEADERS_DEFAULT,
            timeout=timeout or TIMEOUT_DEFAULT,
            # httpx does not follow redirects by default and
            # raise_for_status() rejects 3xx responses, so without this
            # every redirecting URL (http->https, www, ...) yields "".
            follow_redirects=True,
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # Drop elements that carry no readable article text.
        for element in soup(
            ["script", "style", "nav", "footer", "iframe", "noscript"]
        ):
            element.decompose()

        # Prefer a dedicated content container over the whole body.
        main_content = (
            soup.find("main")
            or soup.find("article")
            or soup.find("div", {"class": "content"})
        )
        content_source = main_content if main_content else soup.body
        if not content_source:
            return ""
        return content_source.get_text(separator=" ", strip=True)
    except httpx.HTTPStatusError as e:
        logger.debug(f"HTTP Error [{e.response.status_code}]: {e}")
        return ""
    except Exception as e:
        logger.debug(f"Error parsing webpage content: {e}")
        return ""