162 lines
5.6 KiB
Python
162 lines
5.6 KiB
Python
"""
|
||
Парсер первого источника - газета (hljnews.cn)
|
||
"""
|
||
import requests
|
||
from bs4 import BeautifulSoup
|
||
from datetime import datetime
|
||
from urllib.parse import urljoin
|
||
from typing import List
|
||
|
||
from .base import BaseParser
|
||
from config import PARSER_TIMEOUT, MIN_ARTICLE_TEXT_LENGTH, MAX_ARTICLE_TEXT_LENGTH
|
||
from utils import logger, create_folder, get_current_date_parts
|
||
from services import fetch_with_proxy_retry, gpt_response_message, update_bd_and_create_document
|
||
import work_parser as wp
|
||
|
||
|
||
def extract_map_area_hrefs(url: str, verify: bool = True, ist_number: int = 1) -> List[str]:
    """
    Collect absolute links from a downloaded HTML page.

    The page at ``url`` is fetched with ``requests`` and parsed with
    BeautifulSoup. Which elements are scanned depends on ``ist_number``:

    * ``1`` -- every ``<a href>`` inside ``<li data-page="1">`` elements
      (each link found this way is also printed);
    * any other value -- every ``<area href>`` inside ``<map>`` elements.

    Args:
        url: Page to download; also the base used to resolve relative hrefs.
        verify: Forwarded to ``requests.get`` as its ``verify`` argument.
        ist_number: Selects the extraction strategy described above.

    Returns:
        Absolute URLs in document order; duplicates are not removed.

    Raises:
        requests.RequestException: On connection failure, on timeout, or when
            ``raise_for_status`` rejects a 4xx/5xx response.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyScraper/1.0; +https://example.com)"
    }

    # requests has no default timeout: without one a stalled server would
    # block the crawl forever. Reuse the parser-wide limit from config.
    resp = requests.get(url, headers=headers, verify=verify, timeout=PARSER_TIMEOUT)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")

    hrefs = []
    if ist_number == 1:
        for item in soup.find_all("li", attrs={"data-page": "1"}):
            for anchor in item.find_all("a", href=True):
                abs_url = urljoin(url, anchor["href"])
                print(abs_url)
                hrefs.append(abs_url)
    else:
        for map_tag in soup.find_all("map"):
            hrefs.extend(
                urljoin(url, area["href"])
                for area in map_tag.find_all("area", href=True)
            )
    return hrefs
|
||
|
||
|
||
def extract_text_from_url_one(url: str, timeout: int = PARSER_TIMEOUT, verify: bool = True) -> str:
    """
    Fetch an article page of the first source and return its text.

    The markup returned by ``fetch_with_proxy_retry`` is parsed with
    BeautifulSoup. The result is the text of the ``<h3>`` inside
    ``div.newsdetatit`` immediately followed by the newline-joined ``<p>``
    texts found under ``<founder-content>`` inside ``div.newsdetatext``.
    Missing elements contribute an empty string. The combined text is cut to
    ``MAX_ARTICLE_TEXT_LENGTH`` characters.

    Args:
        url: Article address.
        timeout: Forwarded to ``fetch_with_proxy_retry``.
        verify: Forwarded to ``fetch_with_proxy_retry``.

    Returns:
        Headline and body concatenated with no separator between them.
    """
    markup = fetch_with_proxy_retry(url, timeout=timeout, verify=verify)
    page = BeautifulSoup(markup, "html.parser")

    # Headline: <div class="newsdetatit"> -> <h3>
    headline = ''
    title_box = page.find('div', class_='newsdetatit')
    heading = title_box.find('h3') if title_box else None
    if heading:
        headline = heading.get_text(strip=True)

    # Body: <div class="newsdetatext"> -> <founder-content> -> all <p>
    body = ''
    text_box = page.find('div', class_='newsdetatext')
    founder = text_box.find('founder-content') if text_box else None
    if founder:
        body = '\n'.join(par.get_text(strip=True) for par in founder.find_all('p'))

    text = headline + body

    limit = MAX_ARTICLE_TEXT_LENGTH
    if len(text) > limit:
        text = text[:limit]
    # NOTE(review): the original indentation was lost; the length is printed
    # unconditionally here -- confirm it was not meant only for truncated text.
    print(len(text))

    return text
|
||
|
||
|
||
def check_url(url: str) -> bool:
    """
    Report whether ``url`` is already recorded in the database.

    Calls ``wp.check_url_exists(url)``. On an HTTP 200 answer the ``"exists"``
    field of the JSON body is printed and returned. Any other status code,
    and any exception raised along the way, yields ``False``; both cases are
    now logged instead of being discarded silently.

    Args:
        url: Article address to look up.

    Returns:
        The ``"exists"`` value from the response, or ``False`` when the
        lookup did not succeed.
    """
    try:
        response = wp.check_url_exists(url)
        if response.status_code != 200:
            logger.info(f"check_url: unexpected status {response.status_code} for {url}")
            return False
        exists = response.json()["exists"]
    except Exception as e:
        # Best-effort lookup: a failed check must not stop the caller, but it
        # has to leave a trace because a False answer makes the caller treat
        # the URL as new.
        logger.info(f"check_url: {e}")
        return False

    print(exists)
    return exists
|
||
|
||
|
||
class Source1Parser(BaseParser):
    """
    Parser for the first source: the hljnews.cn newspaper.
    """

    def __init__(self):
        super().__init__("source1")

    def parse(self, data_init: str = "") -> None:
        """
        Run the full parsing pass for the first source.

        For layout pages ``node_01`` .. ``node_08`` of the chosen date the
        method collects article links, skips those for which ``check_url``
        returns a truthy value, extracts the article text and, when it is at
        least ``MIN_ARTICLE_TEXT_LENGTH`` characters long, sends it to
        ``gpt_response_message``. A truthy answer is stored through
        ``update_bd_and_create_document``. ``self.start_task`` is called
        before the loop and ``self.complete_task`` after it.

        Args:
            data_init: Despite the ``str`` annotation the value is indexed as
                ``[year, month, day]``. An empty value (``""``, ``None``,
                ``[]`` or ``['']``) selects the date returned by
                ``get_current_date_parts()``.

        Raises:
            IndexError: If a non-empty ``data_init`` has fewer than three
                items.
        """
        # A plain `!= ['']` test lets the default "" through (a str never
        # equals a list) and then fails on data_init[2]; check emptiness first.
        if data_init and data_init != ['']:
            current_year, current_month, current_day = data_init[0], data_init[1], data_init[2]
        else:
            current_year, current_month, current_day = get_current_date_parts()

        # Common prefix of every layout page for the selected date.
        layout_base = f'http://epaper.hljnews.cn/hljrb/pc/layout/{current_year}{current_month}/{current_day}'

        self.start_task(f'{layout_base}/node_0X.html')

        for page_number in range(1, 9):
            url = f'{layout_base}/node_0{page_number}.html'
            # self.task_id is presumably set by start_task -- defined in BaseParser.
            wp.update_task(self.task_id, status='in_progress', source_url=url, started_at=datetime.utcnow())

            print(f"Сбор href из: {url}")
            try:
                hrefs = extract_map_area_hrefs(url, ist_number=2)
            except Exception as e:
                # A layout page that cannot be read is skipped, not fatal.
                print(f"Ошибка при извлечении ссылок: {e}")
                logger.info(f"extract_map_area_hrefs: {e}")
                continue

            for i, link in enumerate(hrefs, 1):
                if check_url(link):
                    continue

                print(f"Страница {page_number} [{i}/{len(hrefs)}] parsing {link}")
                text = extract_text_from_url_one(link)
                if len(text) < MIN_ARTICLE_TEXT_LENGTH:
                    continue

                response_text = gpt_response_message(text, "source1")
                print(response_text)
                if response_text:
                    update_bd_and_create_document(
                        response_text=response_text,
                        article_date=f"{current_year}/{current_month}/{current_day}",
                        url=link,
                        parsed_at=str(datetime.now()),
                        original_text=text,
                        other="source1"
                    )

        self.complete_task()
|
||
|
||
|
||
def start_pars_one_istochnik(data_init: str = "") -> None:
    """
    Entry point for parsing the first source.

    Creates a new ``Source1Parser`` and calls its ``parse`` method with
    ``data_init`` forwarded unchanged.
    """
    Source1Parser().parse(data_init)
|
||
|
||
|
||
def scheduled_parser_1() -> None:
    """
    Hook for automatic, scheduled runs of the first source.

    Takes no arguments and calls ``start_pars_one_istochnik`` with an empty
    ``data_init``.
    """
    start_pars_one_istochnik(data_init="")
|