"""Parser for the second source: military news (def.ltn.com.tw)."""
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urljoin
from typing import List, Tuple

from .base import BaseParser
from config import PARSER_TIMEOUT, MIN_ARTICLE_TEXT_LENGTH
from utils import logger
from services import fetch_with_proxy_retry, gpt_response_message, update_bd_and_create_document
import work_parser as wp


def extract_map_area_hrefs(
    url: str,
    verify: bool = True,
    ist_number: int = 1,
    timeout: int = PARSER_TIMEOUT,
) -> List[str]:
    """Download the page at *url* and collect absolute link URLs from it.

    Args:
        url: Page to download; also used as the base when resolving
            relative ``href`` values.
        verify: Forwarded to ``requests.get`` as its ``verify`` argument.
        ist_number: Selects which markup is scanned. ``1`` reads
            ``<a href>`` tags inside ``<li data-page="1">`` elements; any
            other value reads ``<area href>`` tags inside ``<map>`` elements.
        timeout: Forwarded to ``requests.get`` so a stalled server cannot
            block the caller indefinitely.

    Returns:
        Absolute URLs in document order. Duplicates are not removed.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status
            (raised by ``raise_for_status``).
        requests.RequestException: The request itself failed, including
            when the timeout expires.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyScraper/1.0; +https://example.com)"
    }
    resp = requests.get(url, headers=headers, verify=verify, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Both layouts are walked the same way; only the container and the
    # link-bearing tag differ.
    if ist_number == 1:
        containers = soup.find_all("li", attrs={"data-page": "1"})
        link_tag = "a"
    else:
        containers = soup.find_all("map")
        link_tag = "area"

    hrefs = [
        urljoin(url, link["href"])
        for container in containers
        for link in container.find_all(link_tag, href=True)
    ]

    if ist_number == 1:
        # NOTE(review): debug output on stdout, emitted only for this layout;
        # consider routing it through the project logger instead.
        for abs_url in hrefs:
            print(abs_url)

    return hrefs


def extract_text_from_url(url: str, timeout: int = PARSER_TIMEOUT, verify: bool = True) -> Tuple[str, str]:
    """Extract the text and date from a second-source (military) article.

    Returns a tuple ``(text, date)``.
    """
    response = fetch_with_proxy_retry(url, timeout=timeout, verify=verify)
    soup = BeautifulSoup(response, 'html.parser')

    # Locate the div.whitecon.article container
    container = soup.find("div", class_="whitecon article")
    if not container:
        return "", ""

    # Extract the headline