Compare commits
2 الالتزامات
f0a58c084e
...
main
| المؤلف | SHA1 | التاريخ | |
|---|---|---|---|
| 44ade8ccb4 | |||
| 5363fcb3e8 |
68
arabic (simple scraper) /arabic.py
Normal file
68
arabic (simple scraper) /arabic.py
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
|
||||||
|
# start ----------------------------------------------------------------------
# Crawl configuration and the shared state containers used by the crawler.

# Entry point of the crawl; only links on this same domain are followed.
start_url = "https://dgam.gov.sy/?page_id=9606"

# Minimal browser-like header so the server does not reject the client.
headers = {"User-Agent": "Mozilla/5.0"}

visited = set()          # URLs that have already been fetched
to_visit = [start_url]   # frontier of URLs still waiting to be fetched
results = []             # one {"url", "content"} record per page with text
|
# cleaning -------------------------------------------------------------------
def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is dropped as well, because
    str.split() with no arguments splits on arbitrary whitespace and
    discards empty pieces.
    """
    pieces = text.split()
    return " ".join(pieces)
|
# crawler --------------------------------------------------------------------
# Breadth-first crawl of the site, collecting paragraph text from each page.
# Script-level code with side effects: HTTP requests, a JSON file write, and
# progress output on stdout.
from collections import deque

# deque gives O(1) pops from the left; list.pop(0) is O(n) per pop.
to_visit = deque(to_visit)
# Mirror of the frontier for O(1) "already queued?" membership tests
# (checking `new_url not in to_visit` on a list is O(n) per link).
queued = set(to_visit)

while to_visit:
    url = to_visit.popleft()
    queued.discard(url)

    if url in visited:
        continue

    print("VISITING:", url)

    try:
        response = requests.get(url, headers=headers, timeout=10)
    except Exception as e:
        # Best-effort crawl: report the failure and mark the URL as handled
        # so pages linking to it cannot re-queue it for endless retries.
        print("Error:", e)
        visited.add(url)
        continue

    soup = BeautifulSoup(response.text, "html.parser")

    # text -------------------------------------
    # Keep only paragraphs with more than 10 characters of stripped text,
    # which skips empty/boilerplate <p> tags.
    paragraphs = soup.find_all("p")
    text = "\n".join(
        clean_text(p.get_text())
        for p in paragraphs
        if len(p.get_text(strip=True)) > 10
    )

    if text:
        results.append({
            "url": url,
            "content": text
        })

    # links -----------------------------------------------------
    for link in soup.find_all("a", href=True):
        new_url = urljoin(url, link["href"])

        # Only follow links within the same domain as the start URL.
        if urlparse(new_url).netloc == urlparse(start_url).netloc:
            if new_url not in visited and new_url not in queued:
                to_visit.append(new_url)
                queued.add(new_url)

    visited.add(url)

    # Be polite to the server: at most one request per second.
    time.sleep(1)

# json ---------------------------------------------------------------
# Persist everything scraped; ensure_ascii=False keeps the Arabic text
# human-readable in the output file.
with open("site_full_data.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

print("SCRAPING FINISHED. Total pages:", len(results))
|
||||||
1855
arabic (simple scraper) /site_full_data.json
Normal file
1855
arabic (simple scraper) /site_full_data.json
Normal file
تم حذف اختلاف الملف لأن أحد الأسطر أو أكثر طويلة جداً
المرجع في مشكلة جديدة
حظر مستخدم