Files
parser/parsers/source2.py
Игорь Бандурист 25f2c09064
All checks were successful
continuous-integration/drone/push Build is passing
сделал ревью системы
2026-04-28 22:13:47 +10:00

162 lines
5.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Парсер второго источника - военный (def.ltn.com.tw)
"""
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urljoin
from typing import List, Tuple
from .base import BaseParser
from config import PARSER_TIMEOUT, MIN_ARTICLE_TEXT_LENGTH
from utils import logger
from services import fetch_with_proxy_retry, gpt_response_message, update_bd_and_create_document
import work_parser as wp
def extract_map_area_hrefs(url: str, verify: bool = True, ist_number: int = 1) -> List[str]:
    """
    Collect absolute link URLs from a listing page.

    Args:
        url: Page to download; also used as the base when resolving
            relative ``href`` values.
        verify: Forwarded to ``requests.get`` as its ``verify`` argument.
        ist_number: Layout selector. ``1`` reads ``<a href>`` tags inside
            ``<li data-page="1">`` elements; any other value reads
            ``<area href>`` tags inside ``<map>`` elements.

    Returns:
        Absolute URLs in document order. Duplicates are not removed.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response (raised by ``raise_for_status``).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyScraper/1.0; +https://example.com)"
    }
    # requests applies no timeout by default, so a stalled server would block
    # the whole parser run; bound the call with the project-wide timeout.
    resp = requests.get(url, headers=headers, verify=verify, timeout=PARSER_TIMEOUT)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    hrefs = []
    if ist_number == 1:
        for item in soup.find_all("li", attrs={"data-page": "1"}):
            for anchor in item.find_all("a", href=True):
                abs_url = urljoin(url, anchor["href"])
                print(abs_url)
                hrefs.append(abs_url)
    else:
        for map_tag in soup.find_all("map"):
            for area in map_tag.find_all("area", href=True):
                hrefs.append(urljoin(url, area["href"]))
    return hrefs
def extract_text_from_url(url: str, timeout: int = PARSER_TIMEOUT, verify: bool = True) -> Tuple[str, str]:
    """
    Extract the article text and its date string from a second-source page.

    Args:
        url: Article URL to download.
        timeout: Forwarded to ``fetch_with_proxy_retry``.
        verify: Forwarded to ``fetch_with_proxy_retry``.

    Returns:
        A ``(text, date)`` tuple. ``text`` is the stripped text of every
        ``<p>`` in the article container joined with newlines, skipping
        paragraphs carrying the ``before_ir`` class. ``date`` is the stripped
        text of the first ``<span>`` in the container, or ``""`` if there is
        none. Returns ``("", "")`` when the container is not found.
    """
    response = fetch_with_proxy_retry(url, timeout=timeout, verify=verify)
    soup = BeautifulSoup(response, 'html.parser')

    # Article container: <div class="whitecon article">
    container = soup.find("div", class_="whitecon article")
    if not container:
        return "", ""

    # The date is read from the first <span> inside the container
    # (presumably the publication timestamp - confirm against the page markup).
    span = container.find('span')
    time_t = span.get_text(strip=True) if span else ""

    # BeautifulSoup exposes ``class`` as a list of tokens, so test membership
    # rather than list equality; otherwise a paragraph such as
    # <p class="before_ir extra"> would slip through the filter.
    content_text = [
        p.get_text(strip=True)
        for p in container.find_all('p')
        if 'before_ir' not in (p.get('class') or [])
    ]
    return "\n".join(content_text), time_t
def check_url(url: str) -> bool:
    """
    Check whether the URL is already recorded in the database.

    Calls ``wp.check_url_exists`` and reads the ``"exists"`` field of its
    JSON response.

    Args:
        url: Article URL to look up.

    Returns:
        The ``"exists"`` value from the response when the status code is 200;
        ``False`` for any other status code or if the lookup raises.
    """
    try:
        response = wp.check_url_exists(url)
        if response.status_code != 200:
            return False
        exists = response.json()["exists"]
        print(exists)
        return exists
    except Exception as e:
        # Best-effort lookup: a failure is still reported as "not present",
        # but it is logged because the caller will then re-process the URL.
        logger.info(f"Ошибка при проверке URL {url}: {e}")
        return False
class Source2Parser(BaseParser):
    """
    Parser for the second source, the military news site def.ltn.com.tw.
    """

    def __init__(self):
        super().__init__("source2")

    def parse(self) -> None:
        """
        Run one full parsing pass over the second source.

        Steps:
            1. Mark the task as started.
            2. Collect article links from every listing page; a listing that
               fails to load is logged and skipped.
            3. For each link not already known to ``check_url``, download the
               article text, and if it is at least ``MIN_ARTICLE_TEXT_LENGTH``
               characters long, send it to ``gpt_response_message`` and store
               a non-empty result via ``update_bd_and_create_document``.
            4. Mark the task as completed.

        A failure while handling a single article is logged and does not stop
        the pass.
        """
        self.start_task('https://def.ltn.com.tw/')
        istochnik = [
            'https://def.ltn.com.tw/breakingnewslist',
            'https://def.ltn.com.tw/list/11',
            'https://def.ltn.com.tw/list/19',
            'https://def.ltn.com.tw/list/17',
            'https://def.ltn.com.tw/list/16',
        ]

        all_links = []
        for url in istochnik:
            try:
                print(f"Сбор href из: {url}")
                all_links += extract_map_area_hrefs(url)
            except Exception as e:
                print(f"Ошибка при извлечении ссылок: {e}")
                logger.info(f"Ошибка при извлечении ссылок: {e}")
                continue

        for hrefs in all_links:
            if check_url(hrefs):
                continue
            try:
                text, time_text = extract_text_from_url(hrefs)
                if len(text) < MIN_ARTICLE_TEXT_LENGTH:
                    continue
                response_text = gpt_response_message(text, "source2")
                print(response_text)
                if response_text:
                    update_bd_and_create_document(
                        response_text=response_text,
                        article_date=time_text,
                        url=hrefs,
                        # NOTE(review): datetime.utcnow() is deprecated since
                        # Python 3.12; kept because an aware datetime would
                        # change the stored string format.
                        parsed_at=str(datetime.utcnow()),
                        original_text=text,
                        other="source2"
                    )
            except Exception as e:
                # Catch Exception rather than using a bare ``except`` so that
                # KeyboardInterrupt/SystemExit still stop the run, and record
                # the failure instead of dropping it silently.
                logger.info(f"Ошибка при обработке статьи {hrefs}: {e}")
                continue

        self.complete_task()
def start_pars_two_istochnik() -> None:
    """
    Entry point for parsing the second source.

    Builds a fresh ``Source2Parser`` and runs its ``parse`` method.
    """
    Source2Parser().parse()
def scheduled_parser_2() -> None:
    """
    Function for automatic scheduled runs.

    Delegates to ``start_pars_two_istochnik``.
    """
    start_pars_two_istochnik()