import httpx
from bs4 import BeautifulSoup


async def fetch_url(url: str) -> str:
    """Fetch a web page and return its text content on a single line.

    The page is downloaded with a short-lived ``httpx.AsyncClient``, parsed
    with BeautifulSoup's ``html.parser`` backend, and every run of
    whitespace in the extracted text is collapsed to one space.

    Args:
        url: Address of the page to download.

    Returns:
        The whitespace-normalised text of the page, or the literal string
        ``"Timeout error"`` if the request raised ``httpx.TimeoutException``.

    Raises:
        httpx.HTTPError: Any non-timeout transport failure (connection
            refused, too many redirects, ...) is not caught here and
            propagates to the caller.

    Note:
        The HTTP status code is not checked, so the body of an error page
        (404, 500, ...) is returned like any other page.
    """
    # httpx does not follow redirects unless asked to; without this a URL
    # that answers 301/302 (e.g. an http -> https upgrade) would yield the
    # redirect stub rather than the real page.
    # NOTE(review): the final host may differ from `url` after redirects --
    # relevant if callers validate `url` against an allow-list.
    async with httpx.AsyncClient(follow_redirects=True) as client:
        # Only the network call can time out, so keep the try body minimal.
        try:
            response = await client.get(url, timeout=30.0)
        except httpx.TimeoutException:
            # Sentinel string kept for backward compatibility with callers.
            return "Timeout error"

    # The body has already been read in full by `client.get`, so parsing
    # can happen after the client is closed.
    soup = BeautifulSoup(response.text, "html.parser")
    # split()/join collapses newlines, tabs and repeated spaces.
    return " ".join(soup.get_text().split())