This commit is contained in:
19
parsers/__init__.py
Normal file
19
parsers/__init__.py
Normal file
@@ -0,0 +1,19 @@
|
||||
"""
|
||||
Парсеры приложения
|
||||
"""
|
||||
from .base import BaseParser
|
||||
from .source1 import Source1Parser, start_pars_one_istochnik, scheduled_parser_1
|
||||
from .source2 import Source2Parser, start_pars_two_istochnik, scheduled_parser_2
|
||||
from .universal import UniversalParser, start_pars_all_istochnik
|
||||
|
||||
# Names re-exported by the package, grouped by the module they come from.
__all__ = [
    # .base
    "BaseParser",
    # .source1
    "Source1Parser",
    "start_pars_one_istochnik",
    "scheduled_parser_1",
    # .source2
    "Source2Parser",
    "start_pars_two_istochnik",
    "scheduled_parser_2",
    # .universal
    "UniversalParser",
    "start_pars_all_istochnik",
]
|
||||
47
parsers/base.py
Normal file
47
parsers/base.py
Normal file
@@ -0,0 +1,47 @@
|
||||
"""
|
||||
Базовый класс парсера
|
||||
"""
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List
|
||||
import work_parser as wp
|
||||
|
||||
|
||||
class BaseParser(ABC):
    """
    Base class for all parsers.

    Holds the source name and the id of the parsing task created through
    ``work_parser`` (``wp``); subclasses implement :meth:`parse`.
    """

    def __init__(self, source_name: str):
        # No task exists until start_task() is called.
        self.task_id = None
        self.source_name = source_name

    def _finish_task(self, status: str) -> None:
        """Store ``status`` and the finish time on the current task, if any."""
        if not self.task_id:
            return
        from datetime import datetime

        wp.update_task(self.task_id, status=status, finished_at=datetime.utcnow())

    def start_task(self, source_url: str) -> int:
        """
        Create a parsing task with status ``'queued'`` and return its id.

        The id returned by ``wp.insert_task`` is also kept in ``self.task_id``.
        """
        created_id = wp.insert_task(status='queued', source_url=source_url)
        self.task_id = created_id
        print(f"Создана задача с id: {self.task_id}")
        return created_id

    def complete_task(self) -> None:
        """Mark the current task as ``'completed'``; no-op when no task is set."""
        self._finish_task('completed')

    def fail_task(self) -> None:
        """Mark the current task as ``'failed'``; no-op when no task is set."""
        self._finish_task('failed')

    @abstractmethod
    def parse(self) -> None:
        """Run the parser; every concrete subclass must implement this."""
        pass
|
||||
161
parsers/source1.py
Normal file
161
parsers/source1.py
Normal file
@@ -0,0 +1,161 @@
|
||||
"""
|
||||
Парсер первого источника - газета (hljnews.cn)
|
||||
"""
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
from urllib.parse import urljoin
|
||||
from typing import List
|
||||
|
||||
from .base import BaseParser
|
||||
from config import PARSER_TIMEOUT, MIN_ARTICLE_TEXT_LENGTH, MAX_ARTICLE_TEXT_LENGTH
|
||||
from utils import logger, create_folder, get_current_date_parts
|
||||
from services import fetch_with_proxy_retry, gpt_response_message, update_bd_and_create_document
|
||||
import work_parser as wp
|
||||
|
||||
|
||||
def extract_map_area_hrefs(url: str, verify: bool = True, ist_number: int = 1) -> List[str]:
    """
    Download ``url`` and return the absolute URLs of the links found on it.

    Args:
        url: Page to download; also the base for resolving relative links.
        verify: Passed to ``requests.get`` as the TLS verification flag.
        ist_number: Selects the markup to scan. ``1`` collects ``<a href>``
            inside ``<li data-page="1">`` elements (and prints each URL);
            any other value collects ``<area href>`` inside ``<map>``.

    Returns:
        Absolute URLs in document order (duplicates are kept).

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-success HTTP status (via ``raise_for_status``).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyScraper/1.0; +https://example.com)"
    }

    # requests has no default timeout: without one a stalled server would
    # block the whole parsing run indefinitely.
    resp = requests.get(url, headers=headers, verify=verify, timeout=PARSER_TIMEOUT)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")

    hrefs = []
    if ist_number == 1:
        for li_tag in soup.find_all("li", attrs={"data-page": "1"}):
            for a in li_tag.find_all("a", href=True):
                abs_url = urljoin(url, a["href"])
                print(abs_url)
                hrefs.append(abs_url)
    else:
        for map_tag in soup.find_all("map"):
            for area in map_tag.find_all("area", href=True):
                hrefs.append(urljoin(url, area["href"]))
    return hrefs
|
||||
|
||||
|
||||
def extract_text_from_url_one(url: str, timeout: int = PARSER_TIMEOUT, verify: bool = True) -> str:
    """
    Extract the article text from a page of the first source (newspaper).

    The result is the ``<h3>`` text found inside ``div.newsdetatit``
    followed directly by the newline-joined ``<p>`` texts found inside the
    ``founder-content`` element of ``div.newsdetatext``. Missing parts
    contribute an empty string. The text is cut to
    ``MAX_ARTICLE_TEXT_LENGTH`` characters and its length is printed.
    """
    html = fetch_with_proxy_retry(url, timeout=timeout, verify=verify)
    soup = BeautifulSoup(html, "html.parser")

    # Headline part.
    title_text = ''
    title_div = soup.find('div', class_='newsdetatit')
    h3_tag = title_div.find('h3') if title_div else None
    if h3_tag:
        title_text = h3_tag.get_text(strip=True)

    # Body part.
    content_text = ''
    content_div = soup.find('div', class_='newsdetatext')
    founder_content = content_div.find('founder-content') if content_div else None
    if founder_content:
        paragraph_texts = [p.get_text(strip=True) for p in founder_content.find_all('p')]
        content_text = '\n'.join(paragraph_texts)

    text = title_text + content_text
    if len(text) > MAX_ARTICLE_TEXT_LENGTH:
        text = text[:MAX_ARTICLE_TEXT_LENGTH]

    print(len(text))
    return text
|
||||
|
||||
|
||||
def check_url(url: str) -> bool:
    """
    Check whether the URL already exists in the database.

    Calls ``wp.check_url_exists`` and, on a 200 response, prints and returns
    the ``"exists"`` field of its JSON body. Any other status, or any
    exception raised along the way, yields ``False``.
    """
    try:
        reply = wp.check_url_exists(url)
        if reply.status_code != 200:
            return False
        exists = reply.json()["exists"]
        print(exists)
        return exists
    except Exception:
        return False
|
||||
|
||||
|
||||
class Source1Parser(BaseParser):
    """
    Parser for the first source: the hljnews.cn newspaper.
    """

    def __init__(self):
        super().__init__("source1")

    def parse(self, data_init: str = "") -> None:
        """
        Parse layout pages 1-8 of one newspaper issue.

        Args:
            data_init: Issue date as an indexable sequence whose items 0, 1
                and 2 are year, month and day, inserted into the URL as-is.
                An empty value (the default ``""``) or ``['']`` selects the
                current date from ``get_current_date_parts()``.

        For every article link not yet known to ``check_url`` the text is
        extracted, sent to ``gpt_response_message`` when it is at least
        ``MIN_ARTICLE_TEXT_LENGTH`` long, and a non-empty answer is stored
        through ``update_bd_and_create_document``. A failure on one page or
        one article is logged and skipped; the task is completed at the end.
        """
        # The previous check compared only against [''], so the default ""
        # fell through to data_init[2] and raised IndexError on every
        # scheduled (argument-less) run.
        if data_init and data_init != ['']:
            current_year = data_init[0]
            current_month = data_init[1]
            current_day = data_init[2]
        else:
            current_year, current_month, current_day = get_current_date_parts()

        source_url = f'http://epaper.hljnews.cn/hljrb/pc/layout/{current_year}{current_month}/{current_day}/node_0X.html'
        self.start_task(source_url)

        for page_number in range(1, 9):
            url = f'http://epaper.hljnews.cn/hljrb/pc/layout/{current_year}{current_month}/{current_day}/node_0{page_number}.html'
            wp.update_task(self.task_id, status='in_progress', source_url=url, started_at=datetime.utcnow())

            print(f"Сбор href из: {url}")
            try:
                hrefs = extract_map_area_hrefs(url, ist_number=2)
            except Exception as e:
                print(f"Ошибка при извлечении ссылок: {e}")
                logger.info(f"extract_map_area_hrefs: {e}")
                continue

            for i, link in enumerate(hrefs, 1):
                if check_url(link):
                    continue

                print(f"Страница {page_number} [{i}/{len(hrefs)}] parsing {link}")
                # One broken article must not abort the whole issue and leave
                # the task stuck in 'in_progress'.
                try:
                    text = extract_text_from_url_one(link)
                    if len(text) < MIN_ARTICLE_TEXT_LENGTH:
                        continue

                    response_text = gpt_response_message(text, "source1")
                    print(response_text)
                    if response_text:
                        update_bd_and_create_document(
                            response_text=response_text,
                            article_date=f"{current_year}/{current_month}/{current_day}",
                            url=link,
                            parsed_at=str(datetime.now()),
                            original_text=text,
                            other="source1"
                        )
                except Exception as e:
                    print(f"Ошибка при обработке статьи {link}: {e}")
                    logger.info(f"Ошибка при обработке статьи {link}: {e}")
                    continue

        self.complete_task()
|
||||
|
||||
|
||||
def start_pars_one_istochnik(data_init: str = "") -> None:
    """
    Entry point for parsing the first source.

    Builds a fresh ``Source1Parser`` and runs it, passing ``data_init``
    through to ``Source1Parser.parse`` unchanged.
    """
    Source1Parser().parse(data_init)
|
||||
|
||||
|
||||
def scheduled_parser_1() -> None:
    """
    Scheduler hook for the first source.

    Runs ``start_pars_one_istochnik`` with its default arguments.
    """
    start_pars_one_istochnik()
|
||||
161
parsers/source2.py
Normal file
161
parsers/source2.py
Normal file
@@ -0,0 +1,161 @@
|
||||
"""
|
||||
Парсер второго источника - военный (def.ltn.com.tw)
|
||||
"""
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
from urllib.parse import urljoin
|
||||
from typing import List, Tuple
|
||||
|
||||
from .base import BaseParser
|
||||
from config import PARSER_TIMEOUT, MIN_ARTICLE_TEXT_LENGTH
|
||||
from utils import logger
|
||||
from services import fetch_with_proxy_retry, gpt_response_message, update_bd_and_create_document
|
||||
import work_parser as wp
|
||||
|
||||
|
||||
def extract_map_area_hrefs(url: str, verify: bool = True, ist_number: int = 1) -> List[str]:
    """
    Download ``url`` and return the absolute URLs of the links found on it.

    Args:
        url: Page to download; also the base for resolving relative links.
        verify: Passed to ``requests.get`` as the TLS verification flag.
        ist_number: Selects the markup to scan. ``1`` collects ``<a href>``
            inside ``<li data-page="1">`` elements (and prints each URL);
            any other value collects ``<area href>`` inside ``<map>``.

    Returns:
        Absolute URLs in document order (duplicates are kept).

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-success HTTP status (via ``raise_for_status``).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyScraper/1.0; +https://example.com)"
    }

    # requests has no default timeout: without one a stalled server would
    # block the whole parsing run indefinitely.
    resp = requests.get(url, headers=headers, verify=verify, timeout=PARSER_TIMEOUT)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")

    hrefs = []
    if ist_number == 1:
        for li_tag in soup.find_all("li", attrs={"data-page": "1"}):
            for a in li_tag.find_all("a", href=True):
                abs_url = urljoin(url, a["href"])
                print(abs_url)
                hrefs.append(abs_url)
    else:
        for map_tag in soup.find_all("map"):
            for area in map_tag.find_all("area", href=True):
                hrefs.append(urljoin(url, area["href"]))
    return hrefs
|
||||
|
||||
|
||||
def extract_text_from_url(url: str, timeout: int = PARSER_TIMEOUT, verify: bool = True) -> Tuple[str, str]:
    """
    Extract the text and the date string from an article of the second source.

    Returns:
        ``(text, date)``. ``text`` is the newline-joined content of every
        ``<p>`` in the ``div`` whose class is ``"whitecon article"``, except
        paragraphs whose class list is exactly ``['before_ir']``. ``date`` is
        the text of the first ``<span>`` in that container, or ``""`` when
        there is none. ``("", "")`` is returned when the container is missing.
    """
    html = fetch_with_proxy_retry(url, timeout=timeout, verify=verify)
    soup = BeautifulSoup(html, 'html.parser')

    # Everything of interest lives inside this container.
    container = soup.find("div", class_="whitecon article")
    if not container:
        return "", ""

    # The first <span> of the container is returned as the date value.
    span_tag = container.find('span')
    time_t = span_tag.get_text(strip=True) if span_tag else ""

    # Article paragraphs, without the ones marked class="before_ir".
    body_lines = [
        p.get_text(strip=True)
        for p in container.find_all('p')
        if p.get('class') != ['before_ir']
    ]

    return "\n".join(body_lines), time_t
|
||||
|
||||
|
||||
def check_url(url: str) -> bool:
    """
    Check whether the URL already exists in the database.

    Calls ``wp.check_url_exists`` and, on a 200 response, prints and returns
    the ``"exists"`` field of its JSON body. Any other status, or any
    exception raised along the way, yields ``False``.
    """
    try:
        reply = wp.check_url_exists(url)
        if reply.status_code != 200:
            return False
        exists = reply.json()["exists"]
        print(exists)
        return exists
    except Exception:
        return False
|
||||
|
||||
|
||||
class Source2Parser(BaseParser):
    """
    Parser for the second source: the military news site def.ltn.com.tw.
    """

    def __init__(self):
        super().__init__("source2")

    def parse(self) -> None:
        """
        Collect article links from the site's list pages and process them.

        Links are gathered from five fixed list pages; a list page that
        fails to load is logged and skipped. Each distinct link not yet
        known to ``check_url`` is downloaded, and when its text is at least
        ``MIN_ARTICLE_TEXT_LENGTH`` long it is sent to
        ``gpt_response_message``; a non-empty answer is stored through
        ``update_bd_and_create_document``. A failure on one article is
        logged and skipped. The task is marked completed at the end.
        """
        self.start_task('https://def.ltn.com.tw/')

        istochnik = [
            'https://def.ltn.com.tw/breakingnewslist',
            'https://def.ltn.com.tw/list/11',
            'https://def.ltn.com.tw/list/19',
            'https://def.ltn.com.tw/list/17',
            'https://def.ltn.com.tw/list/16'
        ]
        all_links = []

        for url in istochnik:
            try:
                print(f"Сбор href из: {url}")
                all_links += extract_map_area_hrefs(url)
            except Exception as e:
                print(f"Ошибка при извлечении ссылок: {e}")
                logger.info(f"Ошибка при извлечении ссылок: {e}")
                continue

        # The same article can be listed on several pages; dict.fromkeys
        # drops repeats while keeping first-seen order, so each article is
        # fetched and sent to the model at most once per run.
        for hrefs in dict.fromkeys(all_links):
            if check_url(hrefs):
                continue

            try:
                text, time_text = extract_text_from_url(hrefs)
                if len(text) < MIN_ARTICLE_TEXT_LENGTH:
                    continue

                response_text = gpt_response_message(text, "source2")
                print(response_text)
                if response_text:
                    update_bd_and_create_document(
                        response_text=response_text,
                        article_date=time_text,
                        url=hrefs,
                        parsed_at=str(datetime.utcnow()),
                        original_text=text,
                        other="source2"
                    )
            # Was a bare `except:`, which also swallowed KeyboardInterrupt /
            # SystemExit and hid every failure; catch Exception and log it.
            except Exception as e:
                print(f"Ошибка при обработке статьи {hrefs}: {e}")
                logger.info(f"Ошибка при обработке статьи {hrefs}: {e}")
                continue

        self.complete_task()
|
||||
|
||||
|
||||
def start_pars_two_istochnik() -> None:
    """
    Entry point for parsing the second source.

    Builds a fresh ``Source2Parser`` and runs its ``parse`` method.
    """
    Source2Parser().parse()
|
||||
|
||||
|
||||
def scheduled_parser_2() -> None:
    """
    Scheduler hook for the second source.

    Runs ``start_pars_two_istochnik``.
    """
    start_pars_two_istochnik()
|
||||
114
parsers/universal.py
Normal file
114
parsers/universal.py
Normal file
@@ -0,0 +1,114 @@
|
||||
"""
|
||||
Парсер любого источника - универсальный парсер
|
||||
"""
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
from newspaper import Article
|
||||
from urllib.parse import urljoin, urlparse, urldefrag
|
||||
from typing import Set
|
||||
|
||||
from .base import BaseParser
|
||||
from utils import logger
|
||||
from services import gpt_response_message, update_bd_and_create_document
|
||||
import work_parser as wp
|
||||
|
||||
|
||||
def check_url(url: str) -> bool:
    """
    Check whether the URL already exists in the database.

    Calls ``wp.check_url_exists`` and, on a 200 response, prints and returns
    the ``"exists"`` field of its JSON body. Any other status, or any
    exception raised along the way, yields ``False``.
    """
    try:
        reply = wp.check_url_exists(url)
        if reply.status_code != 200:
            return False
        exists = reply.json()["exists"]
        print(exists)
        return exists
    except Exception:
        return False
|
||||
|
||||
|
||||
class UniversalParser(BaseParser):
    """
    Universal parser that can be pointed at any source page.
    """

    def __init__(self, url: str, promt: str, timeout: float = 30.0):
        """
        Args:
            url: Page whose same-domain links are treated as article candidates.
            promt: Value passed to ``gpt_response_message`` with each article
                text and stored as ``other`` on saved documents.
            timeout: Seconds allowed for downloading ``url``.
        """
        super().__init__("universal")
        self.url = url
        self.promt = promt
        self.timeout = timeout

    def parse(self) -> None:
        """
        Download ``self.url`` and process every same-domain link found on it.

        If the start page cannot be downloaded the task is marked failed and
        the method returns. Otherwise each distinct same-domain link that
        passes ``check_url`` / ``wp.check_error_url`` is loaded with
        ``newspaper.Article``. Articles with more than 200 characters of text
        and a publish date are sent to ``gpt_response_message`` and a
        non-empty answer is stored; other articles are recorded through
        ``wp.add_error_url``. A failure on one article is logged and skipped.
        The task is marked completed at the end.
        """
        print(f"Начало парсинга: {self.url} с промтом: {self.promt}")
        self.start_task(self.url)

        try:
            # requests has no default timeout; without one a stalled server
            # would hang the task forever. (The debug dump of the full
            # response body that used to follow was removed.)
            response = requests.get(self.url, timeout=self.timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Ошибка при запросе к {self.url}")
            logger.info(f"Ошибка при запросе к {self.url}: {e}")
            self.fail_task()
            return

        soup = BeautifulSoup(response.text, 'html.parser')
        base_domain = urlparse(self.url).netloc
        print(base_domain)

        # Pages usually link to the same article several times (image, title,
        # "read more"); remember handled URLs so each is processed once.
        seen: Set[str] = set()

        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href'].strip()
            if not href or href.startswith(('mailto:', 'javascript:')):
                continue

            # Make the URL absolute and drop the fragment (#...).
            abs_url, _ = urldefrag(urljoin(self.url, href))

            # Only links on the same domain as the start page are followed.
            if urlparse(abs_url).netloc != base_domain:
                continue

            if abs_url in seen:
                continue
            seen.add(abs_url)

            print("URL:", abs_url)

            if check_url(abs_url) or not wp.check_error_url(abs_url):
                continue

            try:
                article = Article(abs_url)
                article.download()
                article.parse()

                if len(article.text) > 200 and article.publish_date:
                    time_text = article.publish_date.strftime("%Y/%m/%d %H:%M:%S")

                    response_text = gpt_response_message(str(article.text), self.promt)
                    print(response_text)
                    if response_text:
                        update_bd_and_create_document(
                            response_text=response_text,
                            article_date=time_text,
                            url=abs_url,
                            parsed_at=str(datetime.now()),
                            original_text=article.text,
                            other=self.promt
                        )
                else:
                    # Too short or undated: record the URL via add_error_url.
                    wp.add_error_url(self.url, abs_url)
            except Exception as e:
                print(f"Ошибка при обработке статьи {abs_url}: {e}")
                logger.info(f"Ошибка при обработке статьи {abs_url}: {e}")
                continue

        self.complete_task()
|
||||
|
||||
|
||||
def start_pars_all_istochnik(url: str, promt: str) -> None:
    """
    Entry point for parsing an arbitrary source.

    Builds a ``UniversalParser`` for ``url`` and ``promt`` and runs its
    ``parse`` method.
    """
    UniversalParser(url, promt).parse()
|
||||
Reference in New Issue
Block a user