import random
from concurrent.futures import ProcessPoolExecutor
from time import sleep
from typing import Dict, Generator, List, Optional, Set
from urllib.parse import unquote  # decode percent-encoded result URLs
import httpx
from bs4 import BeautifulSoup
from loguru import logger
from .websearch import TIMEOUT_DEFAULT, WebSearchGeneral
def _get_lynx_useragent():
"""
    Generates a random user-agent string mimicking a Lynx text-mode browser build.
The user agent string is composed of:
- Lynx version: Lynx/x.y.z where x is 2-3, y is 8-9, and z is 0-2
- libwww version: libwww-FM/x.y where x is 2-3 and y is 13-15
- SSL-MM version: SSL-MM/x.y where x is 1-2 and y is 3-5
- OpenSSL version: OpenSSL/x.y.z where x is 1-3, y is 0-4, and z is 0-9
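    Example output (one possibility within the ranges above):
    "Lynx/2.8.1 libwww-FM/2.14 SSL-MM/1.4 OpenSSL/1.3.9"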
Returns:
str: A randomly generated user agent string.
"""
lynx_version = (
f"Lynx/{random.randint(2, 3)}.{random.randint(8, 9)}.{random.randint(0, 2)}"
)
libwww_version = f"libwww-FM/{random.randint(2, 3)}.{random.randint(13, 15)}"
ssl_mm_version = f"SSL-MM/{random.randint(1, 2)}.{random.randint(3, 5)}"
openssl_version = (
f"OpenSSL/{random.randint(1, 3)}.{random.randint(0, 4)}.{random.randint(0, 9)}"
)
return f"{lynx_version} {libwww_version} {ssl_mm_version} {openssl_version}"
class _WebSearchEntryGoogle(dict):
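    """A single Google search result entry (a dict carrying url, title, and content)."""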
def __init__(self, **data):
super().__init__(**data)
url: str
title: str
content: str
class WebSearchGoogle(WebSearchGeneral):
"""WebSearchGoogle provides a unified interface for performing web searches on Google.
It handles search queries and result processing.
Features:
- Performs web searches using Google
- Returns formatted results with title, URL and description
- Supports proxy and region settings
Examples:
>>> from toolregistry.hub.websearch_google import WebSearchGoogle
>>> searcher = WebSearchGoogle()
>>> results = searcher.search("python web scraping", number_results=3)
>>> for result in results:
... print(result["title"])
"""
def __init__(
self,
google_base_url: str = "https://www.google.com",
proxy: Optional[str] = None,
):
"""Initialize WebSearchGoogle with configuration parameters.
Args:
            google_base_url (str): Base URL for Google search. Defaults to "https://www.google.com".
            proxy (Optional[str]): Proxy server URL (e.g. "http://proxy.example.com:8080"). Defaults to None.
"""
self.google_base_url = google_base_url.rstrip("/")
if not self.google_base_url.endswith("/search"):
self.google_base_url += "/search" # Ensure the URL ends with /search
self.proxy: Optional[str] = proxy if proxy else None
def search(
self,
query: str,
number_results: int = 5,
threshold: float = 0.2, # Not used in this implementation, kept for compatibility.
timeout: Optional[float] = None,
) -> List[Dict[str, str]]:
"""Perform search and return results.
        Args:
            query: The search query.
            number_results: The maximum number of results to return. Defaults to 5.
            threshold: Relevance threshold; unused in this implementation and kept for signature compatibility.
            timeout: Optional timeout override in seconds. Falls back to TIMEOUT_DEFAULT when omitted.
Returns:
List of search results, each containing:
- 'title': The title of the search result
- 'url': The URL of the search result
- 'content': The description/content from Google
- 'excerpt': Same as content (for compatibility with WebSearchSearxng)
"""
try:
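            # First pass: collect raw result entries (title, url, snippet) from Google's result pages.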
results = WebSearchGoogle._meta_search_google(
query,
num_results=number_results,
proxy=self.proxy,
timeout=timeout or TIMEOUT_DEFAULT,
google_base_url=self.google_base_url,
)
# TODO: find out how to get score from results
filtered_results = results
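            # Second pass: fetch each result page and enrich the entry with its extracted
            # content, using separate processes so slow pages are retrieved in parallel.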
with ProcessPoolExecutor() as executor:
enriched_results = list(
executor.map(
self._fetch_webpage_content,
filtered_results,
)
)
return enriched_results
except httpx.RequestError as e:
logger.debug(f"Request error: {e}")
return []
except httpx.HTTPStatusError as e:
logger.debug(f"HTTP error: {e.response.status_code}")
return []
@staticmethod
def _meta_search_google(
query,
num_results=10,
proxy: Optional[str] = None,
sleep_interval: float = 0,
timeout: float = 5,
start_num: int = 0,
google_base_url: str = "https://www.google.com/search",
) -> List[_WebSearchEntryGoogle]:
"""Search the Google search engine"""
results = []
fetched_results = 0
fetched_links: Set[str] = set()
# Create a persistent client with connection pooling
with httpx.Client(
proxy=proxy,
headers={
"User-Agent": _get_lynx_useragent(),
"Accept": "*/*",
},
timeout=timeout,
) as client:
start = start_num
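            # Page through Google results until enough unique entries are collected
            # or a page yields no new results.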
while fetched_results < num_results:
response = client.get(
url=google_base_url,
params={
"q": query,
"num": num_results - start + 2,
"start": start,
},
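                    # Preset consent cookies so Google serves results directly instead of
                    # redirecting to its cookie-consent page.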
cookies={
"CONSENT": "PENDING+987",
"SOCS": "CAESHAgBEhIaAB",
},
)
response.raise_for_status()
batch_entries = list(
WebSearchGoogle._parse_google_entries(
response.text, fetched_links, num_results - fetched_results
)
)
if len(batch_entries) == 0:
break
fetched_results += len(batch_entries)
results.extend(batch_entries)
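                # Advance to the next result page (Google paginates in steps of 10).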
start += 10
sleep(sleep_interval)
return results
@staticmethod
def _parse_google_entries(
html: str, fetched_links: Set[str], num_results: int
) -> Generator[_WebSearchEntryGoogle, None, None]:
"""Parse HTML content from Google search results."""
soup = BeautifulSoup(html, "html.parser")
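        # "ezO2md" is the CSS class Google currently assigns to result blocks in its
        # basic (non-JavaScript) HTML layout; this selector may break if Google changes it.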
result_block = soup.find_all("div", class_="ezO2md")
new_results = 0
for result in result_block:
if new_results >= num_results:
break
# Skip non-Tag elements
if not hasattr(result, "find"):
continue
link_tag = result.find("a", href=True)
        # Skip results whose link is missing or not a Tag
if not link_tag or not hasattr(link_tag, "find"):
continue
title_tag = link_tag.find("span", class_="CVA68e")
description_tag = result.find("span", class_="FrIlee")
if not (link_tag and title_tag and description_tag):
continue
try:
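                # Google wraps each result as "/url?q=<target>&...": strip the wrapper,
                # drop trailing tracking parameters, and percent-decode the target URL.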
link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", ""))
if link in fetched_links:
continue
fetched_links.add(link)
title = title_tag.text if title_tag else ""
description = description_tag.text if description_tag else ""
new_results += 1
yield _WebSearchEntryGoogle(
title=title,
url=link,
content=description,
)
except (AttributeError, KeyError, TypeError) as e:
logger.debug(f"Error parsing search result: {e}")
continue
if __name__ == "__main__":
import json
# Example usage
searcher = WebSearchGoogle()
    results = searcher.search("Barcelona weather today", 5)
for result in results:
print(json.dumps(result, indent=2, ensure_ascii=False))
print(searcher.extract(results[0]["url"]))
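
    # A minimal sketch of proxied usage (the proxy URL below is a hypothetical placeholder):
    # proxied_searcher = WebSearchGoogle(proxy="http://proxy.example.com:8080")
    # proxied_results = proxied_searcher.search("python web scraping", number_results=3)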