Files
mdscrap/scraper/static_scraper.py
Beyhan Oğur 9630a33ec1 first commit
2026-04-26 22:00:50 +03:00

44 lines
1.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import requests
from bs4 import BeautifulSoup
import logging
# Configure the root logger when this module is imported: INFO level and a
# "timestamp - level - message" line format.
# NOTE(review): this is an import-time side effect; presumably the module is
# used as part of a script-style tool rather than a reusable library — confirm.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
class StaticScraper:
def __init__(self, base_url: str):
self.base_url = base_url
# Temel header'lar ekleyerek bot korumalarını geçmeye çalışıyoruz
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
def fetch_page(self, url_path: str = "") -> str | None:
"""Belirtilen URL'ye HTTP GET isteği atar ve HTML içeriğini döndürür."""
url = f"{self.base_url}{url_path}"
try:
response = requests.get(url, headers=self.headers, timeout=10)
response.raise_for_status() # Hata kodlarında exception fırlatır (örn: 404, 500)
logging.info(f"Başarılı bir şekilde çekildi: {url}")
return response.text
except requests.RequestException as e:
logging.error(f"Sayfa çekilirken hata oluştu ({url}): {e}")
return None
def parse_title(self, html_content: str) -> str | None:
"""HTML içeriğinden sayfa başlığını (title) ayrıştırır."""
if not html_content:
return None
soup = BeautifulSoup(html_content, "lxml")
title_tag = soup.find("title")
return title_tag.text.strip() if title_tag else None
def extract_links(self, html_content: str) -> list[str]:
"""Sayfadaki tüm bağlantıları (a etiketlerini) çıkartır."""
if not html_content:
return []
soup = BeautifulSoup(html_content, "lxml")
links = []
for a_tag in soup.find_all("a", href=True):
links.append(a_tag["href"])
return links