""" Парсер первого источника - газета (hljnews.cn) """ import requests from bs4 import BeautifulSoup from datetime import datetime from urllib.parse import urljoin from typing import List from .base import BaseParser from config import PARSER_TIMEOUT, MIN_ARTICLE_TEXT_LENGTH, MAX_ARTICLE_TEXT_LENGTH from utils import logger, create_folder, get_current_date_parts from services import fetch_with_proxy_retry, gpt_response_message, update_bd_and_create_document import work_parser as wp def extract_map_area_hrefs(url: str, verify: bool = True, ist_number: int = 1) -> List[str]: """ Извлекает ссылки из map/area тегов или li элементов """ headers = { "User-Agent": "Mozilla/5.0 (compatible; MyScraper/1.0; +https://example.com)" } resp = requests.get(url, headers=headers, verify=verify) resp.raise_for_status() soup = BeautifulSoup(resp.text, "html.parser") hrefs = [] if ist_number == 1: for map_tag in soup.find_all("li", attrs={"data-page": "1"}): for a in map_tag.find_all("a", href=True): href = a["href"] abs_url = urljoin(url, href) print(abs_url) hrefs.append(abs_url) else: for map_tag in soup.find_all("map"): for area in map_tag.find_all("area", href=True): href = area["href"] abs_url = urljoin(url, href) hrefs.append(abs_url) return hrefs def extract_text_from_url_one(url: str, timeout: int = PARSER_TIMEOUT, verify: bool = True) -> str: """ Извлекает текст из статьи первого источника (газета) """ response = fetch_with_proxy_retry(url, timeout=timeout, verify=verify) soup = BeautifulSoup(response, "html.parser") title_div = soup.find('div', class_='newsdetatit') title_text = '' if title_div: h3_tag = title_div.find('h3') if h3_tag: title_text = h3_tag.get_text(strip=True) content_div = soup.find('div', class_='newsdetatext') content_text = '' if content_div: founder_content = content_div.find('founder-content') if founder_content: p_tags = founder_content.find_all('p') content_text = '\n'.join(p.get_text(strip=True) for p in p_tags) text = title_text + content_text if len(text) > MAX_ARTICLE_TEXT_LENGTH: text = text[:MAX_ARTICLE_TEXT_LENGTH] print(len(text)) return text def check_url(url: str) -> bool: """ Проверяет, существует ли URL в базе данных """ try: response = wp.check_url_exists(url) if response.status_code == 200: result = response.json() print(result["exists"]) return result["exists"] else: return False except Exception: return False class Source1Parser(BaseParser): """ Парсер для первого источника - газета hljnews.cn """ def __init__(self): super().__init__("source1") def parse(self, data_init: str = "") -> None: """ Основной метод парсинга первого источника """ if data_init != ['']: current_day = data_init[2] current_month = data_init[1] current_year = data_init[0] else: current_year, current_month, current_day = get_current_date_parts() source_url = f'http://epaper.hljnews.cn/hljrb/pc/layout/{current_year}{current_month}/{current_day}/node_0X.html' self.start_task(source_url) for page_number in range(1, 9): url = f'http://epaper.hljnews.cn/hljrb/pc/layout/{current_year}{current_month}/{current_day}/node_0{page_number}.html' wp.update_task(self.task_id, status='in_progress', source_url=url, started_at=datetime.utcnow()) print(f"Сбор href из: {url}") try: hrefs = extract_map_area_hrefs(url, ist_number=2) except Exception as e: print(f"Ошибка при извлечении ссылок: {e}") logger.info(f"extract_map_area_hrefs: {e}") continue for i, link in enumerate(hrefs, 1): if not check_url(link): print(f"Страница {page_number} [{i}/{len(hrefs)}] parsing {link}") text = extract_text_from_url_one(link) if len(text) >= MIN_ARTICLE_TEXT_LENGTH: response_text = gpt_response_message(text, "source1") print(response_text) if response_text: update_bd_and_create_document( response_text=response_text, article_date=f"{current_year}/{current_month}/{current_day}", url=link, parsed_at=str(datetime.now()), original_text=text, other="source1" ) self.complete_task() def start_pars_one_istochnik(data_init: str = "") -> None: """ Точка входа для парсинга первого источника """ parser = Source1Parser() parser.parse(data_init) def scheduled_parser_1() -> None: """ Функция для автоматического запуска по расписанию """ start_pars_one_istochnik("")