# parser/parsers/source1.py

"""
Парсер первого источника - газета (hljnews.cn)
"""
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urljoin
from typing import List

from .base import BaseParser
from config import PARSER_TIMEOUT, MIN_ARTICLE_TEXT_LENGTH, MAX_ARTICLE_TEXT_LENGTH
from utils import logger, create_folder, get_current_date_parts
from services import fetch_with_proxy_retry, gpt_response_message, update_bd_and_create_document
import work_parser as wp


def extract_map_area_hrefs(url: str, verify: bool = True, ist_number: int = 1) -> List[str]:
    """
    Collect absolute link URLs from a layout page.

    The page at ``url`` is downloaded with ``requests`` and parsed with
    BeautifulSoup. Which elements are scanned depends on ``ist_number``:

    * ``1`` - ``<a href>`` tags inside ``<li data-page="1">`` elements
      (each resolved URL is also printed);
    * any other value - ``<area href>`` tags inside ``<map>`` elements.

    Args:
        url: Page to download; also the base for resolving relative hrefs.
        verify: Passed to ``requests.get`` as the TLS verification flag.
        ist_number: Selects the page layout as described above.

    Returns:
        Absolute URLs in document order (duplicates are not removed).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx response (via ``raise_for_status``).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyScraper/1.0; +https://example.com)"
    }

    # requests has no default timeout; without one a stalled server would
    # block the whole crawl forever. Reuse the module-wide parser timeout.
    resp = requests.get(url, headers=headers, verify=verify, timeout=PARSER_TIMEOUT)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")

    # Both layouts are "container element -> link element with href";
    # only the tag names (and the debug print) differ.
    first_layout = ist_number == 1
    if first_layout:
        containers = soup.find_all("li", attrs={"data-page": "1"})
        link_tag = "a"
    else:
        containers = soup.find_all("map")
        link_tag = "area"

    hrefs = []
    for container in containers:
        for node in container.find_all(link_tag, href=True):
            abs_url = urljoin(url, node["href"])
            if first_layout:
                # Kept from the original so console output is unchanged.
                print(abs_url)
            hrefs.append(abs_url)
    return hrefs


def extract_text_from_url_one(url: str, timeout: int = PARSER_TIMEOUT, verify: bool = True) -> str:
    """
    Extract the article text (title followed by body) from a source-1 page.

    The page is fetched with ``fetch_with_proxy_retry``. The title is the
    ``<h3>`` inside ``div.newsdetatit``; the body is every ``<p>`` inside the
    ``<founder-content>`` element of ``div.newsdetatext``, joined with
    newlines. Missing elements contribute an empty string. The title and
    body are concatenated without a separator, and the result is cut to
    ``MAX_ARTICLE_TEXT_LENGTH`` characters (the truncated length is printed
    when that happens).
    """
    html = fetch_with_proxy_retry(url, timeout=timeout, verify=verify)
    page = BeautifulSoup(html, "html.parser")

    # Title: <div class="newsdetatit"> -> <h3>
    headline = ''
    title_block = page.find('div', class_='newsdetatit')
    heading = title_block.find('h3') if title_block else None
    if heading:
        headline = heading.get_text(strip=True)

    # Body: <div class="newsdetatext"> -> <founder-content> -> all <p>
    body = ''
    text_block = page.find('div', class_='newsdetatext')
    founder = text_block.find('founder-content') if text_block else None
    if founder:
        paragraphs = founder.find_all('p')
        body = '\n'.join(par.get_text(strip=True) for par in paragraphs)

    article = headline + body
    if len(article) <= MAX_ARTICLE_TEXT_LENGTH:
        return article

    article = article[:MAX_ARTICLE_TEXT_LENGTH]
    print(len(article))
    return article


def check_url(url: str) -> bool:
    """
    Report whether ``url`` is already known, according to ``wp.check_url_exists``.

    On an HTTP 200 response the ``"exists"`` field of the JSON body is
    printed and returned. Any other status code, and any exception raised
    while calling the service or decoding its answer, yields ``False`` -
    i.e. the URL is treated as not yet seen. Those failure cases are logged
    so that a broken backend is not mistaken for "nothing parsed yet".
    """
    try:
        response = wp.check_url_exists(url)
        if response.status_code != 200:
            logger.info(f"check_url: unexpected status {response.status_code} for {url}")
            return False
        exists = response.json()["exists"]
    except Exception as e:
        # Best-effort by design: never let the existence check abort a crawl,
        # but leave a trace instead of swallowing the error silently.
        logger.info(f"check_url: {e}")
        return False

    print(exists)
    return exists


class Source1Parser(BaseParser):
    """
    Parser for the first source: the hljnews.cn e-paper.

    For one issue date it walks layout pages ``node_01`` .. ``node_08``,
    collects article links from each page's ``<map>/<area>`` tags, and for
    every link that ``check_url`` does not report as known it extracts the
    text, passes it to ``gpt_response_message`` and stores the result with
    ``update_bd_and_create_document``.
    """

    def __init__(self):
        super().__init__("source1")

    def parse(self, data_init: str = "") -> None:
        """
        Run the full parse for one issue of the paper.

        Args:
            data_init: Issue date as an indexable ``[year, month, day]``
                sequence whose items are interpolated into the URL as-is
                (presumably zero-padded strings - TODO confirm against the
                caller). Despite the ``str`` annotation the value is indexed
                by position. An empty value (``""``, ``[]``, ``None``) or
                ``['']`` means "use the current date" from
                ``get_current_date_parts``.
        """
        # The original only compared against [''], so the declared default ""
        # fell into the indexing branch and raised IndexError on data_init[2].
        # Treat every empty form as "no date supplied".
        if data_init and data_init != ['']:
            current_year = data_init[0]
            current_month = data_init[1]
            current_day = data_init[2]
        else:
            current_year, current_month, current_day = get_current_date_parts()

        layout_base = f'http://epaper.hljnews.cn/hljrb/pc/layout/{current_year}{current_month}/{current_day}'
        article_date = f"{current_year}/{current_month}/{current_day}"

        # "node_0X" is a placeholder URL used to register the task as a whole.
        source_url = f'{layout_base}/node_0X.html'
        self.start_task(source_url)

        for page_number in range(1, 9):
            url = f'{layout_base}/node_0{page_number}.html'
            wp.update_task(self.task_id, status='in_progress', source_url=url, started_at=datetime.utcnow())

            print(f"Сбор href из: {url}")
            try:
                hrefs = extract_map_area_hrefs(url, ist_number=2)
            except Exception as e:
                print(f"Ошибка при извлечении ссылок: {e}")
                logger.info(f"extract_map_area_hrefs: {e}")
                continue

            for i, link in enumerate(hrefs, 1):
                if check_url(link):
                    # Already stored - skip.
                    continue

                print(f"Страница {page_number} [{i}/{len(hrefs)}] parsing {link}")
                # Isolate each article: a single failed fetch / GPT call /
                # DB write must not abort the remaining articles and pages
                # (and must not prevent complete_task from running), in the
                # same way link-extraction failures are handled above.
                try:
                    text = extract_text_from_url_one(link)
                    if len(text) < MIN_ARTICLE_TEXT_LENGTH:
                        continue

                    response_text = gpt_response_message(text, "source1")
                    print(response_text)
                    if response_text:
                        update_bd_and_create_document(
                            response_text=response_text,
                            article_date=article_date,
                            url=link,
                            parsed_at=str(datetime.now()),
                            original_text=text,
                            other="source1"
                        )
                except Exception as e:
                    print(f"Ошибка при обработке статьи {link}: {e}")
                    logger.info(f"process article {link}: {e}")

        self.complete_task()


def start_pars_one_istochnik(data_init: str = "") -> None:
    """
    Entry point for parsing the first source.

    Creates a fresh ``Source1Parser`` and calls its ``parse`` method with
    ``data_init`` passed through unchanged.
    """
    Source1Parser().parse(data_init)


def scheduled_parser_1() -> None:
    """
    Scheduler hook: run the first-source parser without arguments.

    Calls ``start_pars_one_istochnik`` with its default (empty) ``data_init``,
    spelled out here explicitly.
    """
    start_pars_one_istochnik(data_init="")