Files
parser/parsers/source1.py
Игорь Бандурист 25f2c09064
All checks were successful
continuous-integration/drone/push Build is passing
сделал ревью системы
2026-04-28 22:13:47 +10:00

162 lines
5.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Парсер первого источника - газета (hljnews.cn)
"""
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urljoin
from typing import List
from .base import BaseParser
from config import PARSER_TIMEOUT, MIN_ARTICLE_TEXT_LENGTH, MAX_ARTICLE_TEXT_LENGTH
from utils import logger, create_folder, get_current_date_parts
from services import fetch_with_proxy_retry, gpt_response_message, update_bd_and_create_document
import work_parser as wp
def extract_map_area_hrefs(url: str, verify: bool = True, ist_number: int = 1) -> List[str]:
    """
    Collect absolute links from a downloaded layout page.

    Args:
        url: Address of the page to download; relative hrefs found on the
            page are resolved against it.
        verify: Forwarded to ``requests.get`` as its ``verify`` argument.
        ist_number: Selects which markup is scanned. ``1`` reads ``<a href>``
            tags inside ``<li data-page="1">`` elements (each link is also
            printed); any other value reads ``<area href>`` tags inside
            ``<map>`` elements.

    Returns:
        Absolute URLs in document order; duplicates are kept.

    Raises:
        requests.RequestException: On a connection failure, a timeout, or a
            non-success HTTP status (raised by ``raise_for_status``).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyScraper/1.0; +https://example.com)"
    }
    # requests waits forever unless a timeout is given, so a single stalled
    # connection would hang the whole run; bound it with the shared setting.
    resp = requests.get(url, headers=headers, verify=verify, timeout=PARSER_TIMEOUT)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    hrefs = []
    if ist_number == 1:
        for item in soup.find_all("li", attrs={"data-page": "1"}):
            for a in item.find_all("a", href=True):
                abs_url = urljoin(url, a["href"])
                print(abs_url)
                hrefs.append(abs_url)
    else:
        for map_tag in soup.find_all("map"):
            hrefs.extend(
                urljoin(url, area["href"])
                for area in map_tag.find_all("area", href=True)
            )
    return hrefs
def extract_text_from_url_one(url: str, timeout: int = PARSER_TIMEOUT, verify: bool = True) -> str:
    """
    Download one article page of the first source and return its text.

    The result is the headline (the ``<h3>`` inside ``div.newsdetatit``)
    directly followed by the body: the text of every ``<p>`` inside the
    ``founder-content`` element of ``div.newsdetatext``, joined with
    newlines. A part that is absent from the page contributes an empty
    string. The text is cut to ``MAX_ARTICLE_TEXT_LENGTH`` characters and its
    final length is printed.

    Args:
        url: Article address, passed to ``fetch_with_proxy_retry``.
        timeout: Forwarded to ``fetch_with_proxy_retry``.
        verify: Forwarded to ``fetch_with_proxy_retry``.

    Returns:
        The extracted (possibly truncated) article text.
    """
    page = fetch_with_proxy_retry(url, timeout=timeout, verify=verify)
    soup = BeautifulSoup(page, "html.parser")

    # Headline: only present when both the wrapper div and its <h3> exist.
    headline = ''
    title_box = soup.find('div', class_='newsdetatit')
    heading = title_box.find('h3') if title_box else None
    if heading:
        headline = heading.get_text(strip=True)

    # Body: paragraphs of the <founder-content> element inside the text div.
    body = ''
    text_box = soup.find('div', class_='newsdetatext')
    container = text_box.find('founder-content') if text_box else None
    if container:
        paragraphs = [p.get_text(strip=True) for p in container.find_all('p')]
        body = '\n'.join(paragraphs)

    text = headline + body
    if len(text) > MAX_ARTICLE_TEXT_LENGTH:
        text = text[:MAX_ARTICLE_TEXT_LENGTH]
    print(len(text))
    return text
def check_url(url: str) -> bool:
    """
    Report whether the URL is already recorded in the database.

    Calls ``wp.check_url_exists`` and, when the response status is 200,
    prints and returns the ``"exists"`` field of its JSON body.

    Args:
        url: Address to look up.

    Returns:
        The ``"exists"`` value from the response, or ``False`` when the
        status is not 200 or the lookup fails for any reason.
    """
    try:
        response = wp.check_url_exists(url)
        if response.status_code != 200:
            return False
        exists = response.json()["exists"]
    except Exception as e:
        # Best effort: a failed lookup is treated as "not stored" so parsing
        # can go on, but the failure is logged instead of vanishing silently.
        logger.info(f"check_url: {e}")
        return False
    print(exists)
    return exists
class Source1Parser(BaseParser):
    """
    Parser for the first source: the hljnews.cn e-paper.
    """

    def __init__(self):
        super().__init__("source1")

    def parse(self, data_init: str = "") -> None:
        """
        Parse layout pages 1-8 of one newspaper issue.

        For every layout page the task record is updated, the article links
        are collected, and each link not yet known to ``check_url`` is
        downloaded. Articles whose text has at least
        ``MIN_ARTICLE_TEXT_LENGTH`` characters are sent to
        ``gpt_response_message``; a non-empty answer is stored through
        ``update_bd_and_create_document``. A layout page whose links cannot
        be collected is logged and skipped.

        Args:
            data_init: Issue date as an indexable ``[year, month, day]``
                sequence whose items are inserted into the URL as-is. An
                empty value (the default ``""``, ``[]``, ``None``) or
                ``['']`` selects the values returned by
                ``get_current_date_parts()``.
        """
        # The emptiness test must come first: the default is "" and
        # `"" != ['']` is True, so without it the default call would index
        # an empty string and raise IndexError.
        if data_init and data_init != ['']:
            current_year = data_init[0]
            current_month = data_init[1]
            current_day = data_init[2]
        else:
            current_year, current_month, current_day = get_current_date_parts()

        # "node_0X" is a placeholder standing for the whole set of pages.
        source_url = f'http://epaper.hljnews.cn/hljrb/pc/layout/{current_year}{current_month}/{current_day}/node_0X.html'
        self.start_task(source_url)

        for page_number in range(1, 9):
            url = f'http://epaper.hljnews.cn/hljrb/pc/layout/{current_year}{current_month}/{current_day}/node_0{page_number}.html'
            wp.update_task(self.task_id, status='in_progress', source_url=url, started_at=datetime.utcnow())
            print(f"Сбор href из: {url}")
            try:
                hrefs = extract_map_area_hrefs(url, ist_number=2)
            except Exception as e:
                # A missing or broken layout page must not stop the others.
                print(f"Ошибка при извлечении ссылок: {e}")
                logger.info(f"extract_map_area_hrefs: {e}")
                continue

            for i, link in enumerate(hrefs, 1):
                if check_url(link):
                    continue  # reported as already stored
                print(f"Страница {page_number} [{i}/{len(hrefs)}] parsing {link}")
                text = extract_text_from_url_one(link)
                if len(text) < MIN_ARTICLE_TEXT_LENGTH:
                    continue  # too short to be worth processing
                response_text = gpt_response_message(text, "source1")
                print(response_text)
                if not response_text:
                    continue
                update_bd_and_create_document(
                    response_text=response_text,
                    article_date=f"{current_year}/{current_month}/{current_day}",
                    url=link,
                    parsed_at=str(datetime.now()),
                    original_text=text,
                    other="source1"
                )

        self.complete_task()
def start_pars_one_istochnik(data_init: str = "") -> None:
    """
    Entry point for parsing the first source.

    Creates a ``Source1Parser`` and runs its ``parse`` method with
    ``data_init`` passed through unchanged.
    """
    Source1Parser().parse(data_init)
def scheduled_parser_1() -> None:
    """
    Run the first-source parser for automatic, scheduled launches.

    Calls ``start_pars_one_istochnik`` with no arguments, so its default
    ``data_init`` is used.
    """
    start_pars_one_istochnik()