Source code for toolregistry.hub.websearch.websearch_google

import random
from concurrent.futures import ProcessPoolExecutor
from time import sleep
from typing import Dict, Generator, List, Optional, Set
from urllib.parse import unquote  # to decode the url

import httpx
from bs4 import BeautifulSoup
from loguru import logger

from .websearch import TIMEOUT_DEFAULT, WebSearchGeneral


def _get_lynx_useragent():
    """
    Generates a random user agent string in the style of the Lynx text-mode browser.

    The user agent string is composed of:
    - Lynx version: Lynx/x.y.z where x is 2-3, y is 8-9, and z is 0-2
    - libwww version: libwww-FM/x.y where x is 2-3 and y is 13-15
    - SSL-MM version: SSL-MM/x.y where x is 1-2 and y is 3-5
    - OpenSSL version: OpenSSL/x.y.z where x is 1-3, y is 0-4, and z is 0-9

    Returns:
        str: A randomly generated user agent string.
    """
    lynx_version = (
        f"Lynx/{random.randint(2, 3)}.{random.randint(8, 9)}.{random.randint(0, 2)}"
    )
    libwww_version = f"libwww-FM/{random.randint(2, 3)}.{random.randint(13, 15)}"
    ssl_mm_version = f"SSL-MM/{random.randint(1, 2)}.{random.randint(3, 5)}"
    openssl_version = (
        f"OpenSSL/{random.randint(1, 3)}.{random.randint(0, 4)}.{random.randint(0, 9)}"
    )
    return f"{lynx_version} {libwww_version} {ssl_mm_version} {openssl_version}"


class _WebSearchEntryGoogle(dict):
    """Dict-based container for a single Google search result."""

    url: str
    title: str
    content: str

    def __init__(self, **data):
        super().__init__(**data)

class WebSearchGoogle(WebSearchGeneral):
    """WebSearchGoogle provides a unified interface for performing web searches on Google.
    It handles search queries and result processing.

    Features:
    - Performs web searches using Google
    - Returns formatted results with title, URL and description
    - Supports proxy and region settings

    Examples:
        >>> from toolregistry.hub.websearch_google import WebSearchGoogle
        >>> searcher = WebSearchGoogle()
        >>> results = searcher.search("python web scraping", number_results=3)
        >>> for result in results:
        ...     print(result["title"])
    """
    def __init__(
        self,
        google_base_url: str = "https://www.google.com",
        proxy: Optional[str] = None,
    ):
        """Initialize WebSearchGoogle with configuration parameters.

        Args:
            google_base_url (str): Base URL for the Google search.
                Defaults to "https://www.google.com".
            proxy (Optional[str]): Optional proxy server URL
                (e.g. "http://proxy.example.com:8080").
        """
        self.google_base_url = google_base_url.rstrip("/")
        if not self.google_base_url.endswith("/search"):
            self.google_base_url += "/search"  # Ensure the URL ends with /search
        self.proxy: Optional[str] = proxy if proxy else None
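    # For example, with the defaults above, "https://www.google.com" is normalized
    # to "https://www.google.com/search"; a base URL already ending in "/search"
    # is left unchanged (aside from trailing-slash stripping).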
    def search(
        self,
        query: str,
        number_results: int = 5,
        threshold: float = 0.2,  # Not used in this implementation; kept for compatibility.
        timeout: Optional[float] = None,
    ) -> List[Dict[str, str]]:
        """Perform search and return results.

        Args:
            query: The search query.
            number_results: The maximum number of results to return. Default is 5.
            threshold: Unused here; kept for signature compatibility with other backends.
            timeout: Optional timeout override in seconds.

        Returns:
            List of search results, each containing:
                - 'title': The title of the search result
                - 'url': The URL of the search result
                - 'content': The description/content from Google
                - 'excerpt': Same as content (for compatibility with WebSearchSearxng)
        """
        try:
            results = WebSearchGoogle._meta_search_google(
                query,
                num_results=number_results,
                proxy=self.proxy,
                timeout=timeout or TIMEOUT_DEFAULT,
                google_base_url=self.google_base_url,
            )
            # TODO: find out how to get score from results
            filtered_results = results
            with ProcessPoolExecutor() as executor:
                enriched_results = list(
                    executor.map(
                        self._fetch_webpage_content,
                        filtered_results,
                    )
                )
            return enriched_results
        except httpx.RequestError as e:
            logger.debug(f"Request error: {e}")
            return []
        except httpx.HTTPStatusError as e:
            logger.debug(f"HTTP error: {e.response.status_code}")
            return []
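    # A minimal usage sketch (the proxy URL is illustrative, not a real endpoint):
    #   searcher = WebSearchGoogle(proxy="http://proxy.example.com:8080")
    #   results = searcher.search("python web scraping", number_results=3, timeout=10)
    #   urls = [r["url"] for r in results]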
    @staticmethod
    def _meta_search_google(
        query,
        num_results=10,
        proxy: Optional[str] = None,
        sleep_interval: float = 0,
        timeout: float = 5,
        start_num: int = 0,
        google_base_url: str = "https://www.google.com/search",
    ) -> List[_WebSearchEntryGoogle]:
        """Search the Google search engine."""
        results = []
        fetched_results = 0
        fetched_links: Set[str] = set()

        # Create a persistent client with connection pooling
        with httpx.Client(
            proxy=proxy,
            headers={
                "User-Agent": _get_lynx_useragent(),
                "Accept": "*/*",
            },
            timeout=timeout,
        ) as client:
            start = start_num
            while fetched_results < num_results:
                response = client.get(
                    url=google_base_url,
                    params={
                        "q": query,
                        "num": num_results - start + 2,
                        "start": start,
                    },
                    cookies={
                        "CONSENT": "PENDING+987",
                        "SOCS": "CAESHAgBEhIaAB",
                    },
                )
                response.raise_for_status()

                batch_entries = list(
                    WebSearchGoogle._parse_google_entries(
                        response.text, fetched_links, num_results - fetched_results
                    )
                )
                if len(batch_entries) == 0:
                    break

                fetched_results += len(batch_entries)
                results.extend(batch_entries)
                start += 10
                sleep(sleep_interval)

        return results

    @staticmethod
    def _parse_google_entries(
        html: str, fetched_links: Set[str], num_results: int
    ) -> Generator[_WebSearchEntryGoogle, None, None]:
        """Parse HTML content from Google search results."""
        soup = BeautifulSoup(html, "html.parser")
        result_block = soup.find_all("div", class_="ezO2md")

        new_results = 0
        for result in result_block:
            if new_results >= num_results:
                break

            # Skip non-Tag elements
            if not hasattr(result, "find"):
                continue

            link_tag = result.find("a", href=True)
            # Skip non-Tag elements
            if not link_tag or not hasattr(link_tag, "find"):
                continue

            title_tag = link_tag.find("span", class_="CVA68e")
            description_tag = result.find("span", class_="FrIlee")
            if not (link_tag and title_tag and description_tag):
                continue

            try:
                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", ""))
                if link in fetched_links:
                    continue
                fetched_links.add(link)

                title = title_tag.text if title_tag else ""
                description = description_tag.text if description_tag else ""
                new_results += 1

                yield _WebSearchEntryGoogle(
                    title=title,
                    url=link,
                    content=description,
                )
            except (AttributeError, KeyError, TypeError) as e:
                logger.debug(f"Error parsing search result: {e}")
                continue
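
# Illustrative HTML shape that _parse_google_entries expects (class names taken
# from the selectors above; Google's actual markup changes over time, so this is
# a sketch, not a guarantee):
#   <div class="ezO2md">
#     <a href="/url?q=https://example.com&sa=...">
#       <span class="CVA68e">Result title</span>
#     </a>
#     <span class="FrIlee">Result description</span>
#   </div>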
if __name__ == "__main__":
    import json

    # Example usage
    searcher = WebSearchGoogle()
    results = searcher.search("巴塞罗那今日天气", 5)  # Chinese for "Barcelona weather today"
    for result in results:
        print(json.dumps(result, indent=2, ensure_ascii=False))

    print(searcher.extract(results[0]["url"]))