Files
mdscrap/scraper/docs_crawler.py
Beyhan Oğur 9630a33ec1 first commit
2026-04-26 22:00:50 +03:00

139 lines
5.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import time
import logging
from typing import Optional
from urllib.parse import urlparse, urljoin

from bs4 import BeautifulSoup
from markdownify import markdownify as md
from playwright.sync_api import sync_playwright
# Configure the root logger when this module is imported: INFO level, with each
# record rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
class DocsCrawler:
    """Crawl a documentation site in FIFO order and save each page as Markdown.

    Pages are rendered with a headless Chromium (Playwright), the main content
    element is extracted with BeautifulSoup, converted with markdownify and
    written under ``data/md_docs/<domain>/<url path>.md``. Only links that stay
    under ``url_prefix`` are followed.
    """

    # URL path suffixes that are skipped instead of being rendered as pages.
    _SKIPPED_EXTENSIONS = (".png", ".jpg", ".pdf", ".zip", ".json")

    # Desktop Chrome user agent sent with every request.
    _USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )

    def __init__(self, start_url: str, max_pages: int = 50, headless: bool = True,
                 allowed_path: Optional[str] = None):
        """Set up the crawl state.

        Args:
            start_url: First URL to visit; also supplies the scheme and domain.
            max_pages: Maximum number of pages to attempt (failed pages count).
            headless: Whether Chromium is launched without a visible window.
            allowed_path: Path prefix that followed links must stay under.
                Defaults to the path of ``start_url``.
        """
        self.start_url = start_url
        self.max_pages = max_pages
        self.headless = headless
        self.visited = set()
        self.queue = [start_url]

        parsed_url = urlparse(start_url)
        self.base_domain = parsed_url.netloc
        self.base_path = allowed_path if allowed_path is not None else parsed_url.path
        # Only links located under this prefix are crawled.
        self.url_prefix = f"{parsed_url.scheme}://{self.base_domain}{self.base_path}"

    def is_valid_url(self, url: str) -> bool:
        """Return True if *url* is an unvisited page located under ``url_prefix``.

        The prefix must end on a URL boundary: with the prefix
        ``https://example.com/docs`` the URL ``https://example.com/docs/intro``
        is accepted, while ``https://example.com/docs-old`` and
        ``https://example.com.evil.net/`` (for a bare-domain prefix) are not.
        """
        if url in self.visited:
            return False
        if not url.startswith(self.url_prefix):
            return False

        # A plain startswith() would also accept sibling paths and look-alike
        # hosts, so require the character after the prefix to be a separator.
        remainder = url[len(self.url_prefix):]
        if remainder and not self.url_prefix.endswith("/") and remainder[0] not in "/?#":
            return False

        # Test the path only, case-insensitively, so "logo.PNG" and
        # "logo.png?v=2" are skipped as well.
        if urlparse(url).path.lower().endswith(self._SKIPPED_EXTENSIONS):
            return False
        return True

    def save_markdown(self, url: str, content: str):
        """Write *content* to a ``.md`` file whose location mirrors the URL path.

        For example ``/docs/app/building-your-application`` is stored as
        ``data/md_docs/<domain>/docs/app/building-your-application.md``; an
        empty path is stored as ``index.md``.
        """
        parsed = urlparse(url)
        # Drop empty and dot segments: URLs come from crawled pages, and a
        # ".." segment must not be able to place the file outside the output
        # directory.
        segments = [seg for seg in parsed.path.split("/") if seg not in ("", ".", "..")]
        if not segments:
            segments = ["index"]
        segments[-1] += ".md"

        file_path = os.path.join("data", "md_docs", self.base_domain, *segments)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        logging.info(f"Kaydedildi: {file_path}")

    def extract_main_content(self, html_content: str) -> str:
        """Return the HTML of the page's main content element, or "" if none.

        Candidates are tried in order: ``<article>``, ``<main>``,
        ``<div id="content">``, ``<div class="content">`` and finally
        ``<body>``. Script, style, nav, footer and header elements inside the
        chosen element are removed before it is serialised.
        """
        soup = BeautifulSoup(html_content, "lxml")
        main_content = (
            soup.find("article")
            or soup.find("main")
            or soup.find("div", id="content")
            or soup.find("div", class_="content")
            or soup.find("body")
        )
        if not main_content:
            return ""

        # Calling a tag is BeautifulSoup shorthand for find_all().
        for unwanted in main_content(["script", "style", "nav", "footer", "header"]):
            unwanted.decompose()
        return str(main_content)

    def _enqueue_links(self, html_content: str, base_url: str, enqueued: set) -> None:
        """Append every new, valid link found in *html_content* to the queue."""
        soup = BeautifulSoup(html_content, "lxml")
        for a_tag in soup.find_all("a", href=True):
            try:
                # Resolve relative links and drop the fragment.
                full_url = urljoin(base_url, a_tag["href"]).split("#")[0]
                is_new = full_url not in enqueued and self.is_valid_url(full_url)
            except ValueError:
                # A malformed href must not abort link discovery for the page.
                continue
            if is_new:
                enqueued.add(full_url)
                self.queue.append(full_url)

    def crawl(self):
        """Visit queued URLs in FIFO order until the queue or ``max_pages`` runs out.

        Each page is rendered, converted to Markdown, saved, and scanned for
        further links. Errors on a single page are logged and the crawl
        continues with the next URL.
        """
        logging.info(f"Tarama başlatılıyor: {self.start_url}")
        # Set of every URL ever queued, for O(1) duplicate checks.
        enqueued = set(self.queue)

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=self.headless)
            try:
                context = browser.new_context(user_agent=self._USER_AGENT)
                page = context.new_page()
                count = 0

                while self.queue and count < self.max_pages:
                    current_url = self.queue.pop(0)
                    # Treat URLs that differ only by their #fragment as one page.
                    clean_url = current_url.split("#")[0]
                    if clean_url in self.visited:
                        continue
                    self.visited.add(clean_url)
                    count += 1
                    logging.info(f"[{count}/{self.max_pages}] Çekiliyor: {clean_url}")

                    try:
                        # networkidle gives JS-rendered sites time to finish loading.
                        page.goto(clean_url, wait_until="networkidle", timeout=30000)
                        html_content = page.content()

                        # After a redirect the browser resolves relative links
                        # against the final URL, so do the same here and avoid
                        # fetching that URL a second time.
                        final_url = page.url.split("#")[0] or clean_url
                        self.visited.add(final_url)

                        content_html = self.extract_main_content(html_content)
                        if content_html:
                            # heading_style="ATX" emits "## Title" style headings.
                            md_text = md(content_html, heading_style="ATX", default_title=True)
                            # Prepend the source URL for later reference.
                            md_text = f"# Source: {clean_url}\n\n" + md_text
                            self.save_markdown(clean_url, md_text)
                        else:
                            logging.warning(f"İçerik bulunamadı: {clean_url}")

                        self._enqueue_links(html_content, final_url, enqueued)
                    except Exception as e:
                        logging.error(f"Hata oluştu ({clean_url}): {e}")

                    # Short pause after every attempt to avoid overloading the site.
                    time.sleep(1)
            finally:
                browser.close()

        logging.info("Tarama tamamlandı.")