Compare commits

..

77 Commits

Author SHA1 Message Date
7f5e9df751 CORS
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-31 16:11:07 +10:00
3266e79e54 ""
All checks were successful
continuous-integration/drone Build is passing
2026-05-20 22:28:33 +10:00
55710cc961 старт коректирую
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-20 20:39:25 +10:00
63ab6ed108 добавление отслеживания количества выгрузки
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-19 22:36:49 +10:00
e9d7a2d51f правка
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-16 13:04:03 +10:00
770445feaa добавил атостарт всех сайтов
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-16 12:41:54 +10:00
656c8d9d9a исправление в выгрузке за пеиод прибавляющее один день к финишу, это связано с особенностями sql запроса
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-08 14:35:42 +10:00
dd1c36b9de убрал ограничение по сохранению док файла
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-08 12:17:16 +10:00
c9abd97efa подредактирована стабильная работа с БД
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-08 11:54:05 +10:00
a841e1f58a дебаг
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-08 11:07:46 +10:00
e5a93e307a long vigryzka 100
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-08 11:05:07 +10:00
74143fd369 url fiks
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-08 10:53:18 +10:00
05cd85d8b5 длина сохранения 100
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-07 21:25:52 +10:00
707c523b53 abrc cj[hfytybz
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-07 20:59:25 +10:00
83020c3124 исправляю ошибки
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-07 18:58:18 +10:00
d0f0ea60a8 abs_url
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-05 22:14:12 +10:00
c7f0ad856a rjhtrrn
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-05 21:53:14 +10:00
94b608cd6d abrc
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-05 21:50:34 +10:00
ef453f661a добавил статус
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-05 21:24:45 +10:00
46350c1c09 ремув баг
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-05 20:22:12 +10:00
3e2820c8ce rer
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-05 19:44:41 +10:00
b688336161 rebut bag
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-05 19:29:47 +10:00
d55e96978e retest
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-03 13:56:13 +10:00
0577f4d65c test
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-03 13:48:49 +10:00
8f86c51d19 расширение выгрузки
All checks were successful
continuous-integration/drone/push Build is passing
Co-authored-by: Copilot <copilot@github.com>
2026-05-01 23:33:40 +10:00
994479fd9d ljltkfk
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-01 22:23:04 +10:00
bb35de53d4 убрал parser_bd 2026-05-01 20:41:55 +10:00
e894e7f9f5 исправил косяки
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-01 20:22:40 +10:00
f4cabfacbe добавил новые поля и работу с донесениями
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-01 19:44:51 +10:00
843bdcfefd добавление полей работы с донесениями
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-01 19:34:39 +10:00
fa51ffae76 фмкс
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-29 23:57:25 +10:00
3b75d14f4c - принт
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-29 23:45:32 +10:00
36b544ad7e добавил условия
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-29 23:32:49 +10:00
8b07841687 Обновить services/gpt_client.py
All checks were successful
continuous-integration/drone/push Build is passing
+ print gpt
2026-04-29 07:26:44 +00:00
85e42e4f3b Обновить config.py
All checks were successful
continuous-integration/drone/push Build is passing
ip
2026-04-29 06:59:29 +00:00
c8d54fcefd Обновить parsers/universal.py
All checks were successful
continuous-integration/drone/push Build is passing
- print
2026-04-29 06:09:30 +00:00
59a4677ace Обновить parsers/universal.py
All checks were successful
continuous-integration/drone/push Build is passing
- print
2026-04-29 05:21:27 +00:00
25f2c09064 сделал ревью системы
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-28 22:13:47 +10:00
c564140428 Ip
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-26 14:10:08 +10:00
17343efe8c сортировка по странам + перенос работы с бд
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-25 16:03:04 +10:00
92cb60aa1d добавил удаление
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-19 19:13:57 +10:00
9e6e1e3c98 по старому
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-17 22:18:26 +10:00
11ff411093 забыл
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-17 22:13:38 +10:00
54f88b5534 ntcn
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-17 22:06:49 +10:00
db646b3ce3 это рабочий проект
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-17 20:24:06 +10:00
343ef43079 + ip к прокси 2026-04-17 20:13:51 +10:00
f9eb047f62 прокси в gpt 2026-04-17 19:54:27 +10:00
619072e2b7 test
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone Build is passing
2026-04-16 21:37:49 +10:00
43aed73d75 4
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-16 21:29:29 +10:00
4f31752a6d nfr
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-16 21:18:45 +10:00
6211267fe8 45
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-16 21:15:47 +10:00
e8adda9621 1
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-16 20:59:52 +10:00
b42e43d0c7 мм
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-16 20:53:18 +10:00
c134840ae9 Ip
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-16 20:41:40 +10:00
e4842934dc rr
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone Build is passing
2026-04-16 19:30:54 +10:00
b1e6dcc2a1 -логи
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-16 14:06:22 +10:00
728389f908 Merge branch 'main' of https://gitea.allowlgroup.ru/allowlgroup/parser
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-16 13:40:56 +10:00
3dace39b14 изменение отслеживания 2026-04-16 13:40:06 +10:00
cec8829291 настройка 2026-04-16 13:39:08 +10:00
14f23d32a6 Обновить main.py
Some checks failed
continuous-integration/drone/push Build is failing
continuous-integration/drone Build is passing
test
2026-04-14 05:28:31 +00:00
cc18b98946 парсер всех источников
Some checks failed
continuous-integration/drone/push Build is passing
continuous-integration/drone Build is failing
2026-04-12 19:11:49 +10:00
4405400715 все источники
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 22:16:33 +10:00
662cfa0994 sourse
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 21:33:43 +10:00
89da884e66 expose_headers=["*"],
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 21:20:57 +10:00
42157e67e1 исправил ошибки
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 21:14:25 +10:00
15f637eb33 async
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone Build is passing
2026-04-11 20:35:02 +10:00
be7b042e7c перевел chek_url в базу данных проекта, то бы не взаимодействовать со сторонним сервисом
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 20:27:48 +10:00
9fd823d8d1 еще
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone Build is passing
2026-04-11 16:04:39 +10:00
5c0cdd03e7 добавлена проверка
Some checks failed
continuous-integration/drone/push Build was killed
2026-04-11 16:03:53 +10:00
4098ac7d8d может так
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 15:52:18 +10:00
5805ab0fe2 исправил url_ist
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 15:39:11 +10:00
f39ade5245 ytljxtns
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone Build is passing
2026-04-11 14:10:05 +10:00
bf8bc173a1 доработка
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 14:07:01 +10:00
38ec470b67 поправил правила хранения промтов
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 14:06:12 +10:00
7e05dda3cb изменил старт первого источника
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 13:50:51 +10:00
4720aa05bc убрал закрытия подключения
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 13:11:42 +10:00
f53121aa09 commit
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 12:56:45 +10:00
21 changed files with 1550 additions and 702 deletions

View File

@@ -43,7 +43,7 @@ steps:
- docker stop parser || true - docker stop parser || true
- docker rm parser || true - docker rm parser || true
- docker pull gitea.allowlgroup.ru/allowlgroup/parser:latest - docker pull gitea.allowlgroup.ru/allowlgroup/parser:latest
- docker run -d --name parser -p 8001:8001 -v /opt/parser_data:/app/documents gitea.allowlgroup.ru/allowlgroup/parser:latest - docker run -d --name parser --network host -p 8001:8001 -v /opt/parser_data:/app/documents gitea.allowlgroup.ru/allowlgroup/parser:latest
when: when:
branch: branch:
- main - main

20
api/__init__.py Normal file
View File

@@ -0,0 +1,20 @@
"""
API модуль приложения
"""
from .schemas import (
ParserOneRequest,
Parserall,
Parserall_url,
Source,
DownloadRange
)
from .routes import setup_routes
__all__ = [
'ParserOneRequest',
'Parserall',
'Parserall_url',
'Source',
'DownloadRange',
'setup_routes'
]

224
api/routes.py Normal file
View File

@@ -0,0 +1,224 @@
"""
API эндпоинты приложения
"""
import os
import zipfile
from datetime import datetime, timedelta
from typing import List
from fastapi import BackgroundTasks, FastAPI, Query, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from config import DOCUMENTS_DIR, APP_TITLE, APP_DESCRIPTION, APP_VERSION
from utils import logger
from api.schemas import ParserOneRequest, Parserall, Source, DownloadRange, DownloadCountsResponse
from parsers import start_pars_one_istochnik, start_pars_two_istochnik, start_pars_all_istochnik
import work_parser as wp
def setup_routes(app: FastAPI) -> None:
    """Set up all API routes on the given FastAPI application."""
    # CORS: only these front-end origins are allowed to call the API from a
    # browser, and they may send credentials.
    allowed_origins = ["http://localhost:5173", "https://allowlgroup.ru"]
    cors_options = {
        "allow_origins": allowed_origins,
        "allow_credentials": True,
        "allow_methods": ["*"],
        "allow_headers": ["*"],
    }
    app.add_middleware(CORSMiddleware, **cors_options)
# ==================== Парсеры ====================
@app.post("/parser_1", summary="Запуск процесса парсинга первого источника")
async def process_parser_one_ist(data: ParserOneRequest, background_tasks: BackgroundTasks):
    # `data.time` is split on "-" and the resulting list of parts is handed to
    # the first-source parser, which runs as a background task.
    date_parts = data.time.split("-")
    background_tasks.add_task(start_pars_one_istochnik, date_parts)
    reply = {"message": "Процесс парсинга 1 источника запущен"}
    return reply
@app.post("/parser_2", summary="Запуск процесса парсинга второго источника")
async def process_parser_two_ist(background_tasks: BackgroundTasks):
    # Queue the second-source parser as a background task; the handler itself
    # only acknowledges that the job was scheduled.
    background_tasks.add_task(start_pars_two_istochnik)
    reply = {"message": "Процесс парсинга 2 источника запущен"}
    return reply
@app.post("/add_sources", summary="Добавление парсинга любого источника")
async def add_sources_all_ist(sources: Parserall):
    # The validated HttpUrl is converted to a plain string before it is passed
    # to wp.add_sources together with the prompt.
    source_url = str(sources.url)
    stored = wp.add_sources(source_url, sources.promt)
    return {"status": "success", "message": "Источник добавлен", "data": stored}
@app.get("/all_sources", summary="Метод получения всех источников")
async def get_all_sources(category: str = "all"):
    # Forward the category filter (default "all") to wp.get_all_sources and
    # return whatever it produces.
    sources = wp.get_all_sources(category)
    return sources
@app.delete("/delete_sources", summary="Метод удаления источника")
async def delete_sources(url: str):
    # Delete the source identified by `url`.
    # The previous `return print(...)` always returned None (print's return
    # value), so the client received `null`; return the actual result and
    # record it in the application log instead of stdout.
    result = wp.delete_sources(url)
    logger.info(f"delete_sources: {result}")
    return result
@app.post("/parser_all", summary="Запуск процесса парсинга любого источника")
async def process_parser_all_ist(url: Parserall, background_tasks: BackgroundTasks):
    # Despite its name, `url` is the whole request model: it carries both the
    # source URL and the prompt that the universal parser receives.
    target_url = str(url.url)
    background_tasks.add_task(start_pars_all_istochnik, target_url, url.promt)
    reply = {"message": "Процесс парсинга любого источника запущен"}
    return reply
@app.get("/get_tasks_offset", summary="Метод получения задач парсинга")
async def get_tasks_offset(limit: int = Query(10, gt=0), offset: int = Query(0, ge=0)):
    # Paginated task listing: `limit` must be positive and `offset`
    # non-negative (enforced by the Query validators above).
    tasks_page = wp.get_tasks_offset(limit, offset)
    return tasks_page
# ==================== Настройки ====================
@app.get("/settings", summary="Метод получения настроек парсера")
async def get_settings():
    # The parser "settings" exposed here are the stored prompts.
    prompts = wp.get_all_promt()
    return prompts
@app.get("/categories_promt", summary="Метод получения categories_promt")
async def get_categories_promt():
    # Return the result of wp.get_all_categories_promt() unchanged.
    categories = wp.get_all_categories_promt()
    return categories
@app.post("/settings", summary="Метод сохранения настроек парсера")
async def set_settings(settings: Source):
    # Update the prompt stored under `settings.name` and return the result of
    # wp.update_promt.
    outcome = wp.update_promt(settings.name, settings.promt)
    return outcome
# ==================== Задачи ====================
@app.delete("/delete_task/{task_id}", summary="Метод удаления задачи")
async def delete_task(task_id: int):
    # Delete the parsing task with the given id.
    # `return print(...)` used to send `null` to the client because print()
    # returns None; return the real result and log it instead.
    result = wp.delete_task(task_id)
    logger.info(f"delete_task: {result}")
    return result
# ==================== Файлы ====================
@app.get("/file_download", summary="Метод для скачивания файла")
async def download_file(path: str, title: str):
    # Serve "<DOCUMENTS_DIR>/<path>/<title>.docx" as a Word attachment.
    file_name = f"{title}.docx"
    file_path = os.path.join(DOCUMENTS_DIR, path, file_name)

    # `path` and `title` come straight from the query string. Refuse anything
    # that resolves outside DOCUMENTS_DIR: an absolute `path` (os.path.join
    # then drops the base directory), ".." segments, or symlinks.
    documents_root = os.path.realpath(DOCUMENTS_DIR)
    try:
        resolved_path = os.path.realpath(file_path)
        inside_root = os.path.commonpath([documents_root, resolved_path]) == documents_root
    except ValueError:
        # Raised for embedded NUL bytes or paths on different drives.
        inside_root = False
    if not inside_root:
        logger.warning(f"Отклонён путь вне каталога документов: {file_path}")
        raise HTTPException(status_code=400, detail="Недопустимый путь к файлу")

    logger.warning(f"Файл: {file_path}")
    if not os.path.exists(file_path):
        logger.warning(f"Файл не найден: {file_path}")
        # Response shape kept as-is for existing clients.
        return {"error": "Файл не найден", "path": file_path}

    response = FileResponse(
        path=file_path,
        filename=file_name,
        media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )
    # NOTE(review): these hand-written headers allow any origin for this
    # endpoint, while the CORS middleware is configured with an explicit
    # origin list — confirm the wildcard is still wanted.
    response.headers["Access-Control-Allow-Origin"] = "*"
    response.headers["Access-Control-Allow-Methods"] = "GET, OPTIONS"
    response.headers["Access-Control-Allow-Headers"] = "Content-Type"
    logger.warning(response)
    return response
@app.post("/download_all", summary="Скачать все файлы за период")
async def download_all(dates: DownloadRange, background_tasks: BackgroundTasks):
    # Build a zip of every .docx in the requested date range whose title is
    # returned by wp.get_articles_by_filter, send it, then (in background
    # tasks) delete the archive and mark the articles as downloaded.
    date_start_str = dates.data_start
    date_finish_str = dates.data_finish
    field_name = getattr(dates, 'field_name', 'status')  # filter field (defaults to 'status')

    try:
        start_date = datetime.strptime(date_start_str, "%Y-%m-%d")
        requested_finish = datetime.strptime(date_finish_str, "%Y-%m-%d")
    except ValueError:
        return {"error": "Неверный формат даты. Используйте YYYY-MM-DD"}

    # Validate the order on the dates the user actually sent. Comparing after
    # the +1 day shift below would accept a start one day later than the finish.
    if start_date > requested_finish:
        return {"error": "Дата начала не может быть позже даты окончания"}

    # The upper bound is moved one day forward before it is used below.
    finish_date = requested_finish + timedelta(days=1)

    # 1. Fetch the list of titles from the database.
    start_date_str = start_date.strftime("%Y-%m-%d")
    finish_date_str = finish_date.strftime("%Y-%m-%d")
    try:
        titles_from_db = wp.get_articles_by_filter(field_name, start_date_str, finish_date_str)
    except Exception as e:
        return {"error": f"Ошибка при получении данных из БД: {e}"}

    if not titles_from_db:
        return {"error": "Нет статей с выбранным фильтром за указанный период", "field_name": field_name}

    # 2. Collect every matching .docx from the per-day folders (YYYY/MM/DD).
    all_files = []
    current_date = start_date
    while current_date <= finish_date:
        date_path = current_date.strftime("%Y/%m/%d")
        full_dir_path = os.path.join(DOCUMENTS_DIR, date_path)
        if os.path.exists(full_dir_path):
            for file in os.listdir(full_dir_path):
                if file.endswith('.docx'):
                    file_title = file[:-5]  # strip the ".docx" extension
                    if file_title in titles_from_db:
                        all_files.append(os.path.join(full_dir_path, file))
        current_date += timedelta(days=1)

    if not all_files:
        return {"error": "Файлы не найдены за указанный период",
                "date_start": date_start_str,
                "date_finish": date_finish_str,
                "titles_found": len(titles_from_db)}

    archive_name = f"documents_{date_start_str}_{date_finish_str}.zip"
    archive_path = os.path.join(DOCUMENTS_DIR, archive_name)

    try:
        with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for file_path in all_files:
                # Files are stored flat, under their base name only.
                zipf.write(file_path, os.path.basename(file_path))
    except Exception as e:
        logger.error(f"Ошибка создания архива: {e}")
        return {"error": f"Ошибка создания архива: {e}"}

    def cleanup_archive():
        # Best-effort removal of the temporary archive.
        try:
            if os.path.exists(archive_path):
                os.remove(archive_path)
                logger.info(f"Архив удалён: {archive_path}")
        except Exception as e:
            logger.warning(f"Не удалось удалить архив: {e}")

    def mark_as_downloaded():
        # Flag the exported articles; a failure is logged, not raised.
        try:
            wp.mark_articles_as_downloaded(titles_from_db)
            logger.info(f"Статьи помечены как скачанные: {len(titles_from_db)} записей")
        except Exception as e:
            logger.error(f"Ошибка при обновлении download: {e}")

    response = FileResponse(
        path=archive_path,
        filename=archive_name,
        media_type="application/zip"
    )
    # NOTE(review): hand-written wildcard CORS headers; the CORS middleware is
    # configured with an explicit origin list — confirm both are intended.
    response.headers["Access-Control-Allow-Origin"] = "*"
    response.headers["Access-Control-Allow-Methods"] = "GET, POST, OPTIONS"
    response.headers["Access-Control-Allow-Headers"] = "Content-Type, Authorization"
    response.headers["Access-Control-Expose-Headers"] = "Content-Disposition"

    background_tasks.add_task(cleanup_archive)
    background_tasks.add_task(mark_as_downloaded)
    return response
# ==================== Выгрузка (download) ====================
@app.get("/download_counts", summary="Получить количество статей для выгрузки", response_model=DownloadCountsResponse)
async def get_download_counts():
    """
    Возвращает количество статей для каждого поля (tematik, svodka, donesenie, bilutene, status),
    где значение поля = TRUE и download = FALSE
    """
    # The docstring above is published as the endpoint description in the
    # OpenAPI schema, so it is kept verbatim. The value returned here is
    # filtered through DownloadCountsResponse by FastAPI.
    counts = wp.get_download_counts()
    return counts
# @app.post("/mark_downloaded", summary="Отметить статьи как скачанные")
# async def mark_articles_as_downloaded(titles: List[str]):
# """
# Обновляет поле download = TRUE для списка заголовков статей
# """
# return wp.mark_articles_as_downloaded(titles)
@app.get("/logs", summary="Показать логи")
async def get_logs():
    # Return the last 10 lines of app.log (resolved against the working
    # directory, as before).
    from collections import deque

    # errors="replace": a single undecodable byte must not turn the endpoint
    # into a 500. deque(maxlen=10) keeps only the tail while streaming the
    # file instead of loading every line into memory.
    with open("app.log", "r", errors="replace") as file:
        lines = list(deque(file, maxlen=10))
    return {"logs": lines}

42
api/schemas.py Normal file
View File

@@ -0,0 +1,42 @@
"""
Pydantic схемы для API
"""
from pydantic import BaseModel, HttpUrl
from typing import List
# Request body for the first-source parser endpoint.
# `time` is a single string; the /parser_1 handler splits it on "-".
# (The class docstring is kept verbatim: Pydantic publishes it as the schema description.)
class ParserOneRequest(BaseModel):
"""Запрос для парсинга первого источника"""
time: str
# Request body for parsing an arbitrary source: a validated URL plus the prompt to use.
# (The class docstring is kept verbatim: Pydantic publishes it as the schema description.)
class Parserall(BaseModel):
"""Запрос для парсинга любого источника"""
url: HttpUrl
promt: str
# Request body carrying only a validated source URL.
# (The class docstring is kept verbatim: Pydantic publishes it as the schema description.)
class Parserall_url(BaseModel):
"""Запрос URL для источника"""
url: HttpUrl
# Settings payload: the /settings POST handler stores `promt` under `name`.
# (The class docstring is kept verbatim: Pydantic publishes it as the schema description.)
class Source(BaseModel):
"""Модель источника для настроек"""
name: str
promt: str
# Date range for the bulk download endpoint.
# `data_start` / `data_finish` are plain strings; /download_all parses them as YYYY-MM-DD.
# `field_name` selects the filter column and defaults to "status".
# (The class docstring is kept verbatim: Pydantic publishes it as the schema description.)
class DownloadRange(BaseModel):
"""Диапазон дат для скачивания файлов"""
data_start: str
data_finish: str
field_name: str = "status"
# Response model of /download_counts: one integer counter per field.
# NOTE(review): the /download_counts docstring also lists a `status` counter, but this
# model has no such field, so FastAPI would drop it from the response — confirm intent.
# (The class docstring is kept verbatim: Pydantic publishes it as the schema description.)
class DownloadCountsResponse(BaseModel):
"""Ответ с количеством статей для выгрузки по каждому полю"""
tematik: int
svodka: int
donesenie: int
bilutene: int

37
config.py Normal file
View File

@@ -0,0 +1,37 @@
"""
Конфигурация приложения
"""
import os
# Paths (all resolved relative to the directory containing this file)
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DOCUMENTS_DIR = os.path.join(BASE_DIR, "documents")
LOG_FILE = os.path.join(BASE_DIR, "app.log")
# GPT server; the GPT_SERVER_URL environment variable overrides the default below
# GPT_SERVER_URL = os.getenv('GPT_SERVER_URL', 'http://45.129.78.228:8484')
GPT_SERVER_URL = os.getenv('GPT_SERVER_URL', 'http://127.0.0.1:8484')
# GPT_SERVER_URL = os.getenv('GPT_SERVER_URL', 'http://127.0.0.1:5500')
# GPT_SERVER_URL = os.getenv('GPT_SERVER_URL', 'http://127.0.0.1:8080')
# Proxies: URL of the public plain-text proxy list
PROXIES_URL = "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt"
# FastAPI settings
APP_TITLE = "Parser API"
APP_DESCRIPTION = "API для запуска парсинга в базу данных"
APP_VERSION = "1.0"
UVICORN_PORT = 8001
# Parser settings (timeouts presumably in seconds, lengths in characters — TODO confirm)
PARSER_TIMEOUT = 10
GPT_TIMEOUT = 60
GPT_MAX_RETRIES = 5
MAX_ARTICLE_TEXT_LENGTH = 4500
MIN_ARTICLE_TEXT_LENGTH = 100
MIN_UNIVERSAL_ARTICLE_TEXT_LENGTH = 200
# Scheduler: hour/minute of the daily cron jobs for parsers 1 and 2
SCHEDULED_PARSER_1_HOUR = 0
SCHEDULED_PARSER_1_MINUTE = 0
SCHEDULED_PARSER_2_HOUR = 1
SCHEDULED_PARSER_2_MINUTE = 0

642
main.py
View File

@@ -1,607 +1,79 @@
# Стандартные библиотеки (stdlib) """
import json Parser API - Точка входа приложения
import logging """
import os
import subprocess
import time
from datetime import datetime as dt
from datetime import datetime, timedelta
import random
import zipfile
import tempfile
# Сторонние библиотеки (third-party)
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from bs4 import BeautifulSoup
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from docx import Document from fastapi import FastAPI
from newspaper import Article
from fastapi import BackgroundTasks, FastAPI, Query, Request, Depends
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from pydantic import BaseModel, HttpUrl
from urllib.parse import urljoin, urlparse, urldefrag
import uvicorn import uvicorn
from work_parser import get_true_sources
import requests from config import (
APP_TITLE,
APP_DESCRIPTION,
APP_VERSION,
UVICORN_PORT,
SCHEDULED_PARSER_1_HOUR,
SCHEDULED_PARSER_1_MINUTE,
SCHEDULED_PARSER_2_HOUR,
SCHEDULED_PARSER_2_MINUTE
)
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from parsers import scheduled_parser_1, scheduled_parser_2, scheduled_parser_universal
from api import setup_routes
# Локальные импорты
# import settings_work as sw
import work_parser as wp
DOCUMENTS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "documents")
# Инициализация планировщика
scheduler = AsyncIOScheduler()
@asynccontextmanager @asynccontextmanager
async def lifespan(app: FastAPI): async def lifespan(app: FastAPI):
"""Управление жизненным циклом приложения""" """Управление жизненным циклом приложения"""
# Startup # Startup
scheduler.add_job(scheduled_parser_1, "cron", hour=0, minute=0) scheduler.add_job(scheduled_parser_1, "cron", hour=SCHEDULED_PARSER_1_HOUR, minute=SCHEDULED_PARSER_1_MINUTE)
scheduler.add_job(scheduled_parser_2, "cron", hour=1, minute=0) scheduler.add_job(scheduled_parser_2, "cron", hour=SCHEDULED_PARSER_2_HOUR, minute=SCHEDULED_PARSER_2_MINUTE)
# Получаем все источники и распределяем их равномерно по 24 часам
sources = get_true_sources().items()
num_sources = len(sources)
if num_sources > 0:
total_minutes_per_day = 24 * 60
minutes_per_source = total_minutes_per_day / num_sources
for idx, (url, promt) in enumerate(sources):
total_minutes = int(idx * minutes_per_source)
scheduled_hour = total_minutes // 60
scheduled_minute = total_minutes % 60
# Для универсального парсера нужно передавать url и promt как аргументы
scheduler.add_job(
scheduled_parser_universal,
"cron",
hour=scheduled_hour,
minute=scheduled_minute,
args=[url, promt]
)
scheduler.start() scheduler.start()
yield yield
# Shutdown # Shutdown
scheduler.shutdown() scheduler.shutdown()
app = FastAPI(title="Parser API",
description="API для запуска парсинга в базу данных",
version="1.0",
lifespan=lifespan)
# Инициализация планировщика
scheduler = AsyncIOScheduler()
# Настройка логгера # Создание приложения FastAPI
logging.basicConfig(filename="app.log", level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") app = FastAPI(
logger = logging.getLogger(__name__) title=APP_TITLE,
description=APP_DESCRIPTION,
# Инициализация таблицы статуса парсинга version=APP_VERSION,
# wp.create_table() lifespan=lifespan
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # или список разрешенных адресов, например ["https://allowlgroup.ru","http://localhost:5173", "http://45.129.78.228:8000"]
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
) )
PROXIES_URL = "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt" # Настройка маршрутов
setup_routes(app)
def download_proxies(url, timeout=10):
    """Download a plain-text proxy list and return it as a list of lines.

    Args:
        url: Address of the list.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.get`` can block indefinitely on a stalled host.

    Returns:
        The response body split into lines on HTTP 200, otherwise ``[]``.

    Raises:
        requests.RequestException: On connection errors or timeout.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return []
    return response.text.splitlines()
def fetch_with_proxy(url, proxy, verify, timeout):
    """Fetch *url* through one HTTP proxy.

    Args:
        url: Page to download.
        proxy: Proxy address, inserted into ``http://{proxy}``.
        verify: Passed to ``requests.get`` (TLS certificate verification).
        timeout: Passed to ``requests.get``.

    Returns:
        The page text (decoded as UTF-8) on success, or ``None`` when the
        request fails, the status is not 200, or the body looks like a
        block page.
    """
    proxies = {
        'http': f'http://{proxy}',
        'https': f'http://{proxy}',
    }
    try:
        response = requests.get(url, proxies=proxies, timeout=timeout, verify=verify)
        response.encoding = 'utf-8'
        if response.status_code == 200:
            # A 200 can still carry an error payload produced by the proxy.
            if '"message":"Request failed' in response.text or '403' in response.text[:500]:
                print(f"Proxy {proxy} - Site returned 403 (inside response)")
                return None
            print(f"Proxy {proxy} - SUCCESS")
            return response.text
        elif response.status_code == 403:
            # The proxy works, but the site blocks it.
            print(f"Proxy {proxy} - 403 Forbidden")
            return None
        else:
            print(f"Proxy {proxy} - Status {response.status_code}")
            return None
    except Exception:
        # Still best-effort (any failure means "try the next proxy"), but no
        # longer a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit.
        return None
# Перемешивает список прокси для случайного начала
def get_shuffled_proxies(proxies_list):
    """Return a randomly ordered copy of *proxies_list*; the input is not modified."""
    randomized = proxies_list.copy()
    random.shuffle(randomized)
    return randomized
# Общие функции нахождения ссылок
def extract_map_area_hrefs(url, verify=True, ist_number=1, timeout=10):
    """Collect absolute article links from a listing page.

    Args:
        url: Page to scan; relative hrefs are resolved against it.
        verify: Passed to ``requests.get`` (TLS certificate verification).
        ist_number: ``1`` reads ``<a href>`` inside ``<li data-page="1">``;
            any other value reads ``<area href>`` inside ``<map>``.
        timeout: Seconds to wait for the server; without it the request
            could hang forever.

    Returns:
        List of absolute URLs in document order.

    Raises:
        requests.RequestException: On network failure, timeout or a non-2xx
            status (``raise_for_status``).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyScraper/1.0; +https://example.com)"
    }
    resp = requests.get(url, headers=headers, verify=verify, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    hrefs = []
    if ist_number == 1:
        for item in soup.find_all("li", attrs={"data-page": "1"}):
            for anchor in item.find_all("a", href=True):
                abs_url = urljoin(url, anchor["href"])
                print(abs_url)
                hrefs.append(abs_url)
    else:
        for map_tag in soup.find_all("map"):
            for area in map_tag.find_all("area", href=True):
                hrefs.append(urljoin(url, area["href"]))
    return hrefs
# функции парсера первого источника (газета)
def extract_text_from_url_one(url, timeout=10, verify=True, max_length=4500):
    """Download a first-source article through a proxy and return its text.

    Proxies from ``PROXIES_URL`` are tried in random order until one returns
    a page. The title (``div.newsdetatit h3``) and the paragraphs inside
    ``div.newsdetatext founder-content`` are concatenated.

    Args:
        url: Article URL.
        timeout: Per-proxy request timeout.
        verify: TLS verification flag for the request.
        max_length: Maximum number of characters returned (previously the
            hard-coded value 4500).

    Returns:
        Title followed by the paragraph text, truncated to *max_length*;
        ``""`` when no proxy succeeded or nothing was found.
    """
    proxies_list = get_shuffled_proxies(download_proxies(PROXIES_URL))

    html = ""
    for proxy in proxies_list:
        fetched = fetch_with_proxy(url, proxy=proxy, timeout=timeout, verify=verify)
        if fetched:
            html = fetched
            break

    soup = BeautifulSoup(html, "html.parser")

    title_text = ''
    title_div = soup.find('div', class_='newsdetatit')
    if title_div:
        h3_tag = title_div.find('h3')
        if h3_tag:
            title_text = h3_tag.get_text(strip=True)

    content_text = ''
    content_div = soup.find('div', class_='newsdetatext')
    if content_div:
        founder_content = content_div.find('founder-content')
        if founder_content:
            content_text = '\n'.join(
                p.get_text(strip=True) for p in founder_content.find_all('p')
            )

    # Slicing is a no-op when the text is already short enough.
    return (title_text + content_text)[:max_length]
#Функции парсера второго источника (военного)
def extract_text_from_url(url, timeout=10, verify=True):
    """Download a second-source article through a proxy.

    Proxies from ``PROXIES_URL`` are tried in random order until one returns
    a page.

    Args:
        url: Article URL.
        timeout: Per-proxy request timeout.
        verify: TLS verification flag for the request.

    Returns:
        ``(text, time_text)``: the paragraphs of ``div.whitecon.article``
        joined with newlines (paragraphs whose class is exactly
        ``before_ir`` are skipped) and the text of the first ``<span>`` in
        that container. Both are ``""`` when the container is missing;
        ``time_text`` is ``""`` when there is no ``<span>``.
    """
    proxies_list = get_shuffled_proxies(download_proxies(PROXIES_URL))

    html = ""
    for proxy in proxies_list:
        fetched = fetch_with_proxy(url, proxy=proxy, timeout=timeout, verify=verify)
        if fetched:
            html = fetched
            break

    soup = BeautifulSoup(html, 'html.parser')
    container = soup.find("div", class_="whitecon article")
    if not container:
        return "", ""

    # Previously `time_t` was only assigned when a <span> existed, so the
    # return statement raised UnboundLocalError on pages without one.
    time_tag = container.find('span')
    time_t = time_tag.get_text(strip=True) if time_tag else ""

    content_text = [
        p.get_text(strip=True)
        for p in container.find_all('p')
        if p.get('class') != ['before_ir']
    ]
    return "\n".join(content_text), time_t
# Общий запрос на GPT
def gpt_response_message(content: str, name_promt: str):
    """Send *content* to the GPT server using the prompt stored under *name_promt*.

    The prompt template is loaded with ``wp.get_promt`` and its ``{content}``
    placeholder is replaced with *content*. The request is attempted up to
    three times; the first response body is returned, or ``""`` when every
    attempt raised.
    """
    prompt_text = wp.get_promt(name_promt).replace('{content}', content)
    endpoint = 'http://45.129.78.228:8484'  # 10.8.0.14:5500
    query = {'text': prompt_text}

    # Bounded number of attempts.
    max_retries = 3
    for _attempt in range(max_retries):
        try:
            return requests.get(endpoint, params=query, timeout=15).text
        except Exception as ex:
            print(f"Ошибка при запросе к GPT: {ex}")
            logger.info(f"gpt_response_message: {ex}")

    logger.info(f"Привышен лимит запросов {max_retries}")
    return ""
# Общие функции проверки ссылок
def check_url(url, timeout=10):
    """Ask the storage service whether *url* has already been saved.

    Args:
        url: Article URL to look up.
        timeout: Seconds to wait for the service; without it the call could
            block the whole parsing run indefinitely.

    Returns:
        The ``exists`` value from the JSON reply on HTTP 200, ``False`` on
        any other status.

    Raises:
        requests.RequestException: On connection errors or timeout.
    """
    print(url)
    response = requests.get(
        'http://45.129.78.228:8002/check_url_exists',
        params={'url': url},
        timeout=timeout,
    )
    if response.status_code != 200:
        print(f"Ошибка: {response.status_code}")
        return False
    exists = response.json()["exists"]
    print(exists)
    return exists
# функции даты первого источника (газета)
def create_folder(num):
    """Return *num* as a string zero-padded to at least two digits.

    Used to build the day and month parts of date-based paths and URLs.
    Accepts an int or a numeric string; already padded input such as
    ``"07"`` stays ``"07"`` (the hand-rolled version produced ``"007"``).
    """
    return f"{int(num):02d}"
# Функция формирования документа
# Parse the GPT reply as JSON, post the enriched record to the storage service and
# write a .docx report under DOCUMENTS_DIR/<first whitespace-separated token of article_date>.
# Parameters: response_text - raw GPT reply (markdown ```json fences are stripped);
# article_date, url, parsed_at, original_text, other - copied into the record as-is;
# `other` is also written under the "link to the article" heading of the document.
# Returns None. Any exception raised while processing is printed and logged, not propagated.
# NOTE(review): indentation is lost in this view, so it is unclear which of the statements
# after `if data['category']:` are conditional on it — confirm against the repository.
def update_bd_and_create_document(response_text, article_date, url, parsed_at, original_text, other):
clean_response = ''
# An empty reply is reported and nothing is saved.
if not response_text:
print(f"Пустой ответ от GPT для URL: {url}")
logger.info(f"Пустой ответ от GPT для URL: {url}")
return
try:
# Strip markdown code fences before decoding the JSON payload.
clean_response = response_text.strip().replace('```json', '').replace('```', '').strip()
data = json.loads(clean_response)
if data['category']:
data['article_date'] = article_date
data['url'] = url
data['parsed_at'] = parsed_at
data['original_text'] = original_text
data['status'] = False
data['viewed'] = False
data['other'] = other
# NOTE(review): no timeout on this request and the response status is only printed.
print(requests.post('http://45.129.78.228:8002/save_parsed_data', json=data))
path_day = article_date.split()[0]
documents_path = os.path.join(DOCUMENTS_DIR, path_day)
if not os.path.exists(documents_path):
os.makedirs(documents_path)
print(f"Создана папка: {documents_path}")
doc = Document()
doc.add_heading('Ссылка на статью', level=1)
doc.add_paragraph(other)
doc.add_heading('Дата и время', level=1)
doc.add_paragraph(article_date)
doc.add_heading('Обноруженные тематики текста', level=1)
doc.add_paragraph(data["category"])
doc.add_heading('Заголовок', level=1)
doc.add_paragraph(data["title"])
doc.add_heading('Краткий пересказ', level=1)
doc.add_paragraph(data["short_text"])
doc.add_heading('Переведенный текст статьи в газете', level=1)
doc.add_paragraph(data["translation_text"])
doc.add_heading('Оригинальный текст', level=1)
doc.add_paragraph(original_text)
# NOTE(review): the GPT-provided title is used as the file name without sanitising;
# a title containing a path separator would change where the file is written.
doc_name = f"{data['title']}.docx"
doc_path = os.path.join(documents_path, doc_name)
doc.save(doc_path)
print(f"Сохранен документ: {doc_path}")
except Exception as ex:
print(f"Ошибка при обработке ответа GPT: {ex}")
logger.info(f"Ошибка при обработке ответа GPT: {ex}")
#Функции start первого источника (газета)
def start_pars_one_istochnik(data_init=""):
    """Parse one issue of the first source (the hljnews.cn e-paper).

    Args:
        data_init: Date parts as ``[day, month, year]`` strings. An empty
            value (the default ``""``) or ``['']`` (what ``"".split("-")``
            yields) selects today's date.

    Pages ``node_01`` .. ``node_08`` of the issue are scanned; every linked
    article that is not yet known and has at least 100 characters of text is
    sent to GPT and stored.
    """
    # The original test `data_init != ['']` let the default "" through and then
    # failed with IndexError on data_init[0]; treat every empty value as "today".
    if data_init and data_init != ['']:
        current_day = data_init[0]
        current_month = data_init[1]
        current_year = data_init[2]
    else:
        datetime_now = dt.now()
        current_day = create_folder(datetime_now.day)
        current_month = create_folder(datetime_now.month)
        current_year = f"{datetime_now.year}"

    task_id = wp.insert_task(status='queued', source_url=f'http://epaper.hljnews.cn/hljrb/pc/layout/{current_year}{current_month}/{current_day}/node_0X.html')
    print("Создана задача с id:", task_id)
    for page_number in range(1, 9):
        url = f'http://epaper.hljnews.cn/hljrb/pc/layout/{current_year}{current_month}/{current_day}/node_0{page_number}.html'
        wp.update_task(task_id, status='in_progress', source_url=url, started_at=datetime.utcnow())
        print(f"Сбор href из: {url}")
        try:
            hrefs = extract_map_area_hrefs(url, ist_number=2)
        except Exception as e:
            print(f"Ошибка при извлечении ссылок: {e}")
            logger.info(f"extract_map_area_hrefs: {e}")
            continue
        for i, link in enumerate(hrefs, 1):
            # Explicit comparison kept on purpose: only a literal False from
            # check_url means "not stored yet".
            if check_url(link) == False:
                print(f"Страница {page_number} [{i}/{len(hrefs)}] parsing {link}")
                text = extract_text_from_url_one(link)
                if len(text) >= 100:
                    response_text = gpt_response_message(text, url_ist="http://epaper.hljnews.cn/hljrb/pc/layout")
                    print(response_text)
                    if response_text:
                        update_bd_and_create_document(
                            response_text=response_text,
                            article_date=f"{current_year}/{current_month}/{current_day}",
                            url=link,
                            parsed_at=str(dt.now()),
                            original_text=text,
                            other=url,
                        )
    wp.update_task(task_id, status='completed', finished_at=datetime.utcnow())
#Функции start второго источника (военного)
def start_pars_two_istochnik():
    """Parse the second source (def.ltn.com.tw listing pages).

    Article links are collected from every listing page first; each unknown
    article with at least 100 characters of text is then sent to GPT and
    stored. A failure on one listing page or one article is logged and does
    not stop the run.
    """
    task_id = wp.insert_task(status='queued', source_url='https://def.ltn.com.tw/')
    istochnik = [
        'https://def.ltn.com.tw/breakingnewslist',
        'https://def.ltn.com.tw/list/11',
        'https://def.ltn.com.tw/list/19',
        'https://def.ltn.com.tw/list/17',
        'https://def.ltn.com.tw/list/16',
    ]
    # (listing URL, article URL) pairs: the listing page is stored with each
    # article. Reading the outer loop variable later would always give the
    # last listing URL.
    all_links = []
    for url in istochnik:
        try:
            print(f"Сбор href из: {url}")
            all_links += [(url, link) for link in extract_map_area_hrefs(url)]
        except Exception as e:
            print(f"Ошибка при извлечении ссылок: {e}")
            logger.info(f"Ошибка при извлечении ссылок: {e}")
            continue
    for source_url, hrefs in all_links:
        if check_url(hrefs) == False:
            try:
                text, time_text = extract_text_from_url(hrefs)
                if len(text) >= 100:
                    response_text = gpt_response_message(text, url_ist="https://def.ltn.com.tw/breakingnewslist")
                    print(response_text)
                    if response_text:
                        update_bd_and_create_document(
                            response_text=response_text,
                            article_date=time_text,
                            url=hrefs,
                            parsed_at=str(dt.now()),
                            original_text=text,
                            other=source_url,
                        )
            except Exception as e:
                # Still best-effort per article, but no longer silent.
                print(f"Ошибка при обработке статьи {hrefs}: {e}")
                logger.info(f"Ошибка при обработке статьи {hrefs}: {e}")
                continue
    wp.update_task(task_id, status='completed', finished_at=datetime.utcnow())
#Функции start любого источника
def start_pars_all_istochnik(url:str, promt:str):
    """Parse an arbitrary site: follow same-domain links found on *url*.

    Args:
        url: Index page whose ``<a href>`` links are followed.
        promt: Prompt name passed on to ``gpt_response_message``.

    Returns:
        An empty set when the index page cannot be fetched (kept for
        compatibility with existing callers), otherwise ``None``.
    """
    task_id = wp.insert_task(status='queued', source_url=url)
    try:
        # Without a timeout a silent server would block the worker indefinitely.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        # Close the task instead of leaving it "queued" forever.
        logger.info(f"Ошибка при запросе к {url}: {e}")
        wp.update_task(task_id, status='failed', finished_at=datetime.utcnow())
        return set()

    soup = BeautifulSoup(response.text, 'html.parser')
    base_domain = urlparse(url).netloc
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href'].strip()
        if not href or href.startswith(('mailto:', 'javascript:')):
            continue
        # Make the link absolute and drop the #fragment.
        abs_url, _ = urldefrag(urljoin(url, href))
        # Only links on the same domain are followed.
        if urlparse(abs_url).netloc != base_domain:
            continue
        print(f"Парсинг {abs_url}")
        if check_url(abs_url) == False and wp.check_error_url(abs_url):
            try:
                article = Article(abs_url)
                article.download()
                article.parse()
                if len(article.text) > 200 and article.publish_date:
                    time_text = article.publish_date.strftime("%Y/%m/%d %H:%M:%S")
                    print("URL:", abs_url)
                    print("Заголовок:", article.title)
                    print("Дата публикации:", time_text)
                    print("Текст статьи:", article.text)
                    response_text = gpt_response_message(str(article.text), promt)
                    print(response_text)
                    if response_text:
                        update_bd_and_create_document(
                            response_text=response_text,
                            article_date=time_text,
                            url=abs_url,
                            parsed_at=str(dt.now()),
                            original_text=article.text,
                            other=url,
                        )
                else:
                    # Too short or undated: record the link as an error URL.
                    wp.add_error_url(url, abs_url)
            except Exception as e:
                print(f"Ошибка при обработке статьи {abs_url}: {e}")
                logger.info(f"Ошибка при обработке статьи {abs_url}: {e}")
                continue  # move on to the next article
    wp.update_task(task_id, status='completed', finished_at=datetime.utcnow())
# start_pars_all_istochnik("https://www.asahi.com", "japan")
# Функции для автоматического запуска
def scheduled_parser_1():
    """Scheduler entry point: parse today's issue of the first source."""
    # Pass the explicit "no date" sentinel. Calling without arguments used the
    # default "", which the parser indexed as a date and failed with IndexError.
    start_pars_one_istochnik([''])
def scheduled_parser_2():
    """Scheduler entry point: run the second-source parser with no arguments."""
    start_pars_two_istochnik()
class ParserOneRequest(BaseModel):
    """Request body of ``POST /parser_1``."""

    # Date string; the handler splits it on "-" and hands the parts to the parser.
    time: str
@app.post("/parser_1", summary="Запуск процесса парсинга первого источника")
async def process_parser_one_ist(data: ParserOneRequest, background_tasks: BackgroundTasks):
    """Schedule a first-source parse for the requested date and return at once.

    Args:
        data: Request body; ``data.time`` is split on ``"-"`` into date parts.
        background_tasks: FastAPI task queue that runs after the response.

    Returns:
        A confirmation message.
    """
    istochnik = data.time.split("-")
    # Pass the callable and its argument separately. Writing
    # add_task(start_pars_one_istochnik(istochnik)) ran the whole parse inside
    # the request and then registered its None result as the "task".
    background_tasks.add_task(start_pars_one_istochnik, istochnik)
    return {"message": "Процесс парсинга 1 источника запущен"}
@app.post("/parser_2" , summary="Запуск процеса парсинга второго источника")
async def process_parser_two_ist(background_tasks: BackgroundTasks):
    """Schedule a second-source parse as a background task.

    The parser function itself is handed to ``add_task`` (it is not called
    here), and a confirmation message is returned.
    """
    background_tasks.add_task(start_pars_two_istochnik)
    return {"message": "Процесс парсинга 2 источника запущен"}
class Parserall(BaseModel):
    """Request body shared by ``POST /add_sources`` and ``POST /parser_all``."""

    # Site to parse; validated as an HTTP(S) URL.
    url: HttpUrl
    # Prompt value forwarded to the parser / source registry.
    promt: str
@app.post("/add_sources" , summary="Добавление парсинга любого источника")
async def add_sources_all_ist(sources: Parserall):
    """Register a new source and return whatever ``wp.add_sources`` returns.

    Args:
        sources: URL and prompt of the source to add.
    """
    # Hand the storage layer a plain string, as /parser_all already does;
    # the validated HttpUrl is not guaranteed to be a str instance.
    return wp.add_sources(str(sources.url), sources.promt)
@app.post("/parser_all" , summary="Запуск процеса парсинга любого источника")
async def process_parser_all_ist(url: Parserall, background_tasks: BackgroundTasks):
    """Schedule a parse of an arbitrary source and return at once.

    Args:
        url: Request body carrying the site URL and the prompt.
        background_tasks: FastAPI task queue that runs after the response.

    Returns:
        A confirmation message.
    """
    # Register the callable with its arguments; calling it here would run the
    # whole parse before the response is sent and register None as the task.
    background_tasks.add_task(start_pars_all_istochnik, str(url.url), url.promt)
    return {"message": "Процесс парсинга любого источника запущен"}
# GET метод для получения
@app.get("/get_tasks_offset", summary="Метод получения задач парсинга")
def get_tasks_offset(limit: int = Query(10, gt=0), offset: int = Query(0, ge=0)):
    """Return the result of ``wp.get_tasks_offset(limit, offset)``.

    Args:
        limit: Query parameter declared with ``gt=0`` and a default of 10.
        offset: Query parameter declared with ``ge=0`` and a default of 0.
    """
    return wp.get_tasks_offset(limit, offset)
# GET метод для получения настроек
@app.get("/settings", summary="Метод получения настроек парсера")
def get_settings():
    """Return the result of ``wp.get_all_promt()`` unchanged."""
    return wp.get_all_promt()
@app.get("/categories_promt", summary="Метод получения categories_promt")
def get_categories_promt():
    """Return the result of ``wp.get_all_categories_promt()`` unchanged."""
    return wp.get_all_categories_promt()
class Source(BaseModel):
    """Request body of ``POST /settings``; all three fields go to ``wp.update_promt``."""

    url: str
    name: str
    promt: str
# POST метод для установки настроек
@app.post("/settings", summary="Метод сохранения настроек парсера")
def set_settings(settings: Source):
    """Forward url, name and prompt to ``wp.update_promt`` and return its result."""
    return wp.update_promt(settings.url, settings.name, settings.promt)
@app.delete("/delete_task/{task_id}", summary="Метод удаления задачи")
def delete_task(task_id: int):
    """Delete a parsing task and return the result of ``wp.delete_task``.

    Args:
        task_id: Identifier taken from the URL path.
    """
    result = wp.delete_task(task_id)
    # Keep the console trace, but return the real result: print() itself
    # returns None, so the client previously always received null.
    print(result)
    return result
@app.get("/file_download", summary="Метод для скачивания файла")
async def download_file(path: str, title: str):
    """Send one stored DOCX document to the client.

    Args:
        path: Sub-folder below ``DOCUMENTS_DIR`` (query parameter).
        title: Document title; the file name is ``"<title>.docx"``.

    Returns:
        A ``FileResponse`` with permissive CORS headers, or an error dict when
        the file does not exist or would lie outside ``DOCUMENTS_DIR``.
    """
    file_name = f"{title}.docx"
    file_path = os.path.join(DOCUMENTS_DIR, path, file_name)
    logger.warning(f"Файл: {file_path}")

    # Both parts come straight from the query string, so resolve the path and
    # refuse anything (e.g. "../..") that ends up outside the documents root.
    base_dir = os.path.realpath(DOCUMENTS_DIR)
    resolved_path = os.path.realpath(file_path)
    try:
        inside_base = os.path.commonpath([base_dir, resolved_path]) == base_dir
    except ValueError:
        # Raised when the two paths cannot be compared (e.g. different drives).
        inside_base = False

    if not inside_base or not os.path.isfile(resolved_path):
        logger.warning(f"Файл не найден: {file_path}")
        return {"error": "Файл не найден", "path": file_path}

    response = FileResponse(
        path=resolved_path,
        filename=file_name,
        media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )
    response.headers["Access-Control-Allow-Origin"] = "*"
    response.headers["Access-Control-Allow-Methods"] = "GET, OPTIONS"
    response.headers["Access-Control-Allow-Headers"] = "Content-Type"
    logger.warning(response)
    return response
class DownloadRange(BaseModel):
    """Request body of ``POST /download_all``; both dates are parsed as ``YYYY-MM-DD``."""

    # First day of the range.
    data_start: str
    # Last day of the range; the handler's loop includes it.
    data_finish: str
@app.post("/download_all", summary="Скачать все файлы за период")
async def download_all(dates: DownloadRange, background_tasks: BackgroundTasks):
    """Zip every DOCX stored for the given date range and send the archive.

    Args:
        dates: Inclusive range; both values must be ``YYYY-MM-DD``.
        background_tasks: Used to delete the temporary archive after sending.

    Returns:
        A ``FileResponse`` with the zip archive, or an error dict when the
        dates are invalid, no files exist, or the archive cannot be built.
    """
    import tempfile

    date_start = dates.data_start
    date_finish = dates.data_finish
    try:
        start_date = datetime.strptime(date_start, "%Y-%m-%d")
        finish_date = datetime.strptime(date_finish, "%Y-%m-%d")
    except ValueError:
        return {"error": "Неверный формат даты. Используйте YYYY-MM-DD"}
    if start_date > finish_date:
        return {"error": "Дата начала не может быть позже даты окончания"}

    # Collect (day label, file path) for every .docx in the per-day folders.
    all_files = []
    current_date = start_date
    while current_date <= finish_date:
        full_dir_path = os.path.join(DOCUMENTS_DIR, current_date.strftime("%Y/%m/%d"))
        if os.path.isdir(full_dir_path):
            day_label = current_date.strftime("%Y-%m-%d")
            for file in os.listdir(full_dir_path):
                if file.endswith('.docx'):
                    all_files.append((day_label, os.path.join(full_dir_path, file)))
        current_date += timedelta(days=1)
    if not all_files:
        return {"error": "Файлы не найдены за указанный период", "date_start": date_start, "date_finish": date_finish}

    archive_name = f"documents_{date_start}_{date_finish}.zip"
    # A unique temporary file: with a fixed path derived from the dates, two
    # requests for the same range would overwrite or delete each other's archive.
    fd, archive_path = tempfile.mkstemp(prefix="documents_", suffix=".zip")
    os.close(fd)
    used_names = set()
    try:
        with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for day_label, file_path in all_files:
                arcname = os.path.basename(file_path)
                # The same title on different days would otherwise produce
                # duplicate entries inside the flat archive.
                if arcname in used_names:
                    arcname = f"{day_label}_{arcname}"
                used_names.add(arcname)
                zipf.write(file_path, arcname)
    except Exception as e:
        logger.error(f"Ошибка создания архива: {e}")
        if os.path.exists(archive_path):
            os.remove(archive_path)
        return {"error": f"Ошибка создания архива: {e}"}

    def cleanup_archive():
        """Remove the temporary archive once the response has been sent."""
        try:
            if os.path.exists(archive_path):
                os.remove(archive_path)
                logger.info(f"Архив удалён: {archive_path}")
        except Exception as e:
            logger.warning(f"Не удалось удалить архив: {e}")

    response = FileResponse(
        path=archive_path,
        filename=archive_name,
        media_type="application/zip"
    )
    response.headers["Access-Control-Allow-Origin"] = "*"
    response.headers["Access-Control-Allow-Methods"] = "GET, POST, OPTIONS"
    response.headers["Access-Control-Allow-Headers"] = "Content-Type, Authorization"
    response.headers["Access-Control-Expose-Headers"] = "Content-Disposition"
    background_tasks.add_task(cleanup_archive)
    return response
@app.get("/logs", summary="Показать логи")
def get_logs():
    """Return the last ten lines of ``app.log``.

    Returns:
        ``{"logs": [...]}``; the list is empty when the log file does not exist.
    """
    from collections import deque

    try:
        # Explicit encoding with errors="replace": the log holds Cyrillic text
        # and one undecodable byte must not break the endpoint.
        with open("app.log", "r", encoding="utf-8", errors="replace") as file:
            # A bounded deque keeps only the tail instead of loading the whole file.
            lines = list(deque(file, maxlen=10))
    except FileNotFoundError:
        lines = []
    return {"logs": lines}
if __name__ == "__main__": if __name__ == "__main__":
uvicorn.run("main:app", port=8001, reload=True) uvicorn.run("main:app", port=UVICORN_PORT, reload=True)

9
models/__init__.py Normal file
View File

@@ -0,0 +1,9 @@
"""
Модели данных
Импортируем ParsedData из parser_bd для обратной совместимости
"""
import parser_bd as pbd
ParsedData = pbd.ParsedData
__all__ = ['ParsedData']

20
parsers/__init__.py Normal file
View File

@@ -0,0 +1,20 @@
"""
Парсеры приложения
"""
from .base import BaseParser
from .source1 import Source1Parser, start_pars_one_istochnik, scheduled_parser_1
from .source2 import Source2Parser, start_pars_two_istochnik, scheduled_parser_2
from .universal import UniversalParser, start_pars_all_istochnik, scheduled_parser_universal
__all__ = [
'BaseParser',
'Source1Parser',
'start_pars_one_istochnik',
'scheduled_parser_1',
'Source2Parser',
'start_pars_two_istochnik',
'scheduled_parser_2',
'UniversalParser',
'start_pars_all_istochnik',
'scheduled_parser_universal'
]

47
parsers/base.py Normal file
View File

@@ -0,0 +1,47 @@
"""
Базовый класс парсера
"""
from abc import ABC, abstractmethod
from typing import List
import work_parser as wp
class BaseParser(ABC):
    """Common base for all parsers: tracks the task record of one run."""

    def __init__(self, source_name: str):
        self.source_name = source_name
        # Set by start_task(); None means no task has been created yet.
        self.task_id = None

    def start_task(self, source_url: str) -> int:
        """Create a 'queued' task for *source_url* and return its id."""
        self.task_id = wp.insert_task(status='queued', source_url=source_url)
        print(f"Создана задача с id: {self.task_id}")
        return self.task_id

    def _finish_task(self, status: str) -> None:
        """Stamp the current task with *status* and the finish time, if one exists."""
        # Compare with None explicitly so an id of 0 is still finalised.
        if self.task_id is None:
            return
        from datetime import datetime
        wp.update_task(self.task_id, status=status, finished_at=datetime.utcnow())

    def complete_task(self) -> None:
        """Mark the current task as completed."""
        self._finish_task('completed')

    def fail_task(self) -> None:
        """Mark the current task as failed."""
        self._finish_task('failed')

    @abstractmethod
    def parse(self) -> None:
        """Run the parser; must be implemented by subclasses."""
        pass

161
parsers/source1.py Normal file
View File

@@ -0,0 +1,161 @@
"""
Парсер первого источника - газета (hljnews.cn)
"""
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urljoin
from typing import List
from .base import BaseParser
from config import PARSER_TIMEOUT, MIN_ARTICLE_TEXT_LENGTH, MAX_ARTICLE_TEXT_LENGTH
from utils import logger, create_folder, get_current_date_parts
from services import fetch_with_proxy_retry, gpt_response_message, update_bd_and_create_document
import work_parser as wp
def extract_map_area_hrefs(url: str, verify: bool = True, ist_number: int = 1, timeout: int = PARSER_TIMEOUT) -> List[str]:
    """Collect absolute article links from a listing page.

    Args:
        url: Page to download.
        verify: Passed to ``requests.get`` as the TLS verification flag.
        ist_number: ``1`` reads ``<a>`` tags inside ``<li data-page="1">``;
            any other value reads ``<area>`` tags inside ``<map>``.
        timeout: Request timeout in seconds.

    Returns:
        The links resolved against *url*, in document order.

    Raises:
        requests.RequestException: On network errors or a non-2xx status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyScraper/1.0; +https://example.com)"
    }
    # The timeout keeps one unresponsive page from stalling the whole run.
    resp = requests.get(url, headers=headers, verify=verify, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    hrefs = []
    if ist_number == 1:
        for item in soup.find_all("li", attrs={"data-page": "1"}):
            for a in item.find_all("a", href=True):
                abs_url = urljoin(url, a["href"])
                print(abs_url)
                hrefs.append(abs_url)
    else:
        for map_tag in soup.find_all("map"):
            for area in map_tag.find_all("area", href=True):
                hrefs.append(urljoin(url, area["href"]))
    return hrefs
def extract_text_from_url_one(url: str, timeout: int = PARSER_TIMEOUT, verify: bool = True) -> str:
    """Download a first-source article and return its title plus body text.

    The title is the ``<h3>`` inside ``div.newsdetatit``; the body is every
    ``<p>`` inside the ``founder-content`` element of ``div.newsdetatext``,
    joined with newlines. The result is cut to ``MAX_ARTICLE_TEXT_LENGTH``
    characters and its length is printed.
    """
    html = fetch_with_proxy_retry(url, timeout=timeout, verify=verify)
    page = BeautifulSoup(html, "html.parser")

    headline = ''
    title_box = page.find('div', class_='newsdetatit')
    heading = title_box.find('h3') if title_box else None
    if heading:
        headline = heading.get_text(strip=True)

    body = ''
    text_box = page.find('div', class_='newsdetatext')
    article_node = text_box.find('founder-content') if text_box else None
    if article_node:
        body = '\n'.join(p.get_text(strip=True) for p in article_node.find_all('p'))

    text = headline + body
    if len(text) > MAX_ARTICLE_TEXT_LENGTH:
        text = text[:MAX_ARTICLE_TEXT_LENGTH]
    print(len(text))
    return text
def check_url(url: str) -> bool:
    """Return the "exists" flag reported by ``wp.check_url_exists`` for *url*.

    Any non-200 status or any exception counts as "not stored" (``False``).
    The flag is also printed.
    """
    try:
        reply = wp.check_url_exists(url)
        if reply.status_code != 200:
            return False
        payload = reply.json()
        print(payload["exists"])
        return payload["exists"]
    except Exception:
        return False
class Source1Parser(BaseParser):
    """Parser for the first source, the hljnews.cn e-paper."""

    def __init__(self):
        super().__init__("source1")

    def parse(self, data_init: "list[str] | str" = "") -> None:
        """Parse pages ``node_01`` .. ``node_08`` of one issue.

        Args:
            data_init: Date parts as ``[year, month, day]`` strings. An empty
                value (the default ``""``) or ``['']`` selects today's date.

        Raises:
            Exception: Whatever an article step raises; the task is marked
                failed before the error is re-raised.
        """
        # The original test `data_init != ['']` let the default "" through and
        # then failed with IndexError; treat every empty value as "today".
        if data_init and data_init != ['']:
            current_day = data_init[2]
            current_month = data_init[1]
            current_year = data_init[0]
        else:
            current_year, current_month, current_day = get_current_date_parts()

        layout_root = f'http://epaper.hljnews.cn/hljrb/pc/layout/{current_year}{current_month}/{current_day}'
        self.start_task(f'{layout_root}/node_0X.html')
        try:
            for page_number in range(1, 9):
                url = f'{layout_root}/node_0{page_number}.html'
                wp.update_task(self.task_id, status='in_progress', source_url=url, started_at=datetime.utcnow())
                print(f"Сбор href из: {url}")
                try:
                    hrefs = extract_map_area_hrefs(url, ist_number=2)
                except Exception as e:
                    print(f"Ошибка при извлечении ссылок: {e}")
                    logger.info(f"extract_map_area_hrefs: {e}")
                    continue
                for i, link in enumerate(hrefs, 1):
                    if check_url(link):
                        continue
                    print(f"Страница {page_number} [{i}/{len(hrefs)}] parsing {link}")
                    text = extract_text_from_url_one(link)
                    if len(text) < MIN_ARTICLE_TEXT_LENGTH:
                        continue
                    response_text = gpt_response_message(text, "source1")
                    print(response_text)
                    if response_text:
                        update_bd_and_create_document(
                            response_text=response_text,
                            article_date=f"{current_year}/{current_month}/{current_day}",
                            url=link,
                            parsed_at=str(datetime.now()),
                            original_text=text,
                            other="source1"
                        )
        except Exception:
            # Do not leave the task "in_progress" when the run dies midway.
            self.fail_task()
            raise
        self.complete_task()
def start_pars_one_istochnik(data_init: str = "") -> None:
    """Entry point for the first source: build a ``Source1Parser`` and run it.

    Args:
        data_init: Passed unchanged to ``Source1Parser.parse``, which indexes
            it as date parts (annotated ``str``, but parse reads [0], [1], [2]).
    """
    parser = Source1Parser()
    parser.parse(data_init)
def scheduled_parser_1() -> None:
    """Scheduler entry point: parse today's issue of the first source."""
    # Pass the ['']  "no date" sentinel that Source1Parser.parse recognises;
    # the plain "" used before was indexed as a date and raised IndexError.
    start_pars_one_istochnik([''])

161
parsers/source2.py Normal file
View File

@@ -0,0 +1,161 @@
"""
Парсер второго источника - военный (def.ltn.com.tw)
"""
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urljoin
from typing import List, Tuple
from .base import BaseParser
from config import PARSER_TIMEOUT, MIN_ARTICLE_TEXT_LENGTH
from utils import logger
from services import fetch_with_proxy_retry, gpt_response_message, update_bd_and_create_document
import work_parser as wp
def extract_map_area_hrefs(url: str, verify: bool = True, ist_number: int = 1, timeout: int = PARSER_TIMEOUT) -> List[str]:
    """Collect absolute article links from a listing page.

    Args:
        url: Page to download.
        verify: Passed to ``requests.get`` as the TLS verification flag.
        ist_number: ``1`` reads ``<a>`` tags inside ``<li data-page="1">``;
            any other value reads ``<area>`` tags inside ``<map>``.
        timeout: Request timeout in seconds.

    Returns:
        The links resolved against *url*, in document order.

    Raises:
        requests.RequestException: On network errors or a non-2xx status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyScraper/1.0; +https://example.com)"
    }
    # The timeout keeps one unresponsive listing from stalling the whole run.
    resp = requests.get(url, headers=headers, verify=verify, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    hrefs = []
    if ist_number == 1:
        for item in soup.find_all("li", attrs={"data-page": "1"}):
            for a in item.find_all("a", href=True):
                abs_url = urljoin(url, a["href"])
                print(abs_url)
                hrefs.append(abs_url)
    else:
        for map_tag in soup.find_all("map"):
            for area in map_tag.find_all("area", href=True):
                hrefs.append(urljoin(url, area["href"]))
    return hrefs
def extract_text_from_url(url: str, timeout: int = PARSER_TIMEOUT, verify: bool = True) -> Tuple[str, str]:
    """Download a second-source article and return ``(text, date)``.

    The article is looked up in ``div.whitecon.article``. The date is the text
    of the first ``<span>`` in that container; the text is every ``<p>`` in it
    except those carrying the ``before_ir`` class, joined with newlines.

    Returns:
        ``("", "")`` when the container is missing, otherwise the text and the
        date string (``""`` when no ``<span>`` is present).
    """
    response = fetch_with_proxy_retry(url, timeout=timeout, verify=verify)
    soup = BeautifulSoup(response, 'html.parser')
    container = soup.find("div", class_="whitecon article")
    if not container:
        return "", ""

    time_tag = container.find('span')
    time_t = time_tag.get_text(strip=True) if time_tag else ""

    # `class` is a multi-valued attribute (a list). Test membership so the
    # paragraph is skipped even when it carries other classes too; comparing
    # with ['before_ir'] only matched when that was the sole class.
    content_text = [
        p.get_text(strip=True)
        for p in container.find_all('p')
        if 'before_ir' not in (p.get('class') or [])
    ]
    return "\n".join(content_text), time_t
def check_url(url: str) -> bool:
    """Tell whether *url* is already stored, per ``wp.check_url_exists``.

    A non-200 status or any exception yields ``False``. The flag is printed
    before it is returned.
    """
    try:
        answer = wp.check_url_exists(url)
        if answer.status_code == 200:
            body = answer.json()
            print(body["exists"])
            return body["exists"]
    except Exception:
        return False
    return False
class Source2Parser(BaseParser):
    """Parser for the second source, def.ltn.com.tw."""

    def __init__(self):
        super().__init__("source2")

    def parse(self) -> None:
        """Collect links from all listing pages, then process each new article.

        A failure on one listing page or one article is logged and skipped;
        the task is marked completed at the end of the run.
        """
        self.start_task('https://def.ltn.com.tw/')
        istochnik = [
            'https://def.ltn.com.tw/breakingnewslist',
            'https://def.ltn.com.tw/list/11',
            'https://def.ltn.com.tw/list/19',
            'https://def.ltn.com.tw/list/17',
            'https://def.ltn.com.tw/list/16'
        ]
        all_links = []
        for url in istochnik:
            try:
                print(f"Сбор href из: {url}")
                all_links += extract_map_area_hrefs(url)
            except Exception as e:
                print(f"Ошибка при извлечении ссылок: {e}")
                logger.info(f"Ошибка при извлечении ссылок: {e}")
                continue

        for hrefs in all_links:
            if check_url(hrefs):
                continue
            try:
                text, time_text = extract_text_from_url(hrefs)
                if len(text) < MIN_ARTICLE_TEXT_LENGTH:
                    continue
                response_text = gpt_response_message(text, "source2")
                if response_text:
                    update_bd_and_create_document(
                        response_text=response_text,
                        article_date=time_text,
                        url=hrefs,
                        # NOTE(review): UTC here, while the other parsers in this
                        # diff store local time via datetime.now() - confirm.
                        parsed_at=str(datetime.utcnow()),
                        original_text=text,
                        other="source2"
                    )
            except Exception as e:
                # Still best-effort per article, but no longer silent; a bare
                # `except:` would also have swallowed KeyboardInterrupt.
                print(f"Ошибка при обработке статьи {hrefs}: {e}")
                logger.info(f"Ошибка при обработке статьи {hrefs}: {e}")
                continue
        self.complete_task()
def start_pars_two_istochnik() -> None:
    """Entry point for the second source: build a ``Source2Parser`` and run it."""
    parser = Source2Parser()
    parser.parse()
def scheduled_parser_2() -> None:
    """Scheduler entry point: delegate to ``start_pars_two_istochnik``."""
    start_pars_two_istochnik()

131
parsers/universal.py Normal file
View File

@@ -0,0 +1,131 @@
"""
Парсер любого источника - универсальный парсер
"""
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from newspaper import Article
from urllib.parse import urljoin, urlparse, urldefrag
from typing import Set
from .base import BaseParser
from utils import logger
from services import gpt_response_message, update_bd_and_create_document
import work_parser as wp
def check_url(url: str) -> bool:
    """Return the "exists" flag that ``wp.check_url_exists`` reports for *url*.

    A non-200 status or any exception is treated as "not stored" (``False``).
    """
    try:
        lookup = wp.check_url_exists(url)
        return lookup.json()["exists"] if lookup.status_code == 200 else False
    except Exception:
        return False
class UniversalParser(BaseParser):
    """Generic parser: follows same-domain links found on one index page."""

    # Seconds to wait for the index page before giving up on the source.
    REQUEST_TIMEOUT = 30

    def __init__(self, url: str, promt: str):
        super().__init__("universal")
        self.url = url
        self.promt = promt

    def parse(self) -> None:
        """Fetch the index page and process every new same-domain article.

        An article is sent to GPT only when it has more than 200 characters of
        text and a publish date; otherwise its link is recorded through
        ``wp.add_error_url``. When no article qualified at all,
        ``wp.update_source_status`` is called for the source.
        """
        print(f"Начало парсинга: {self.url} с промтом: {self.promt}")
        self.start_task(self.url)
        num = 0
        try:
            # Without a timeout a silent server would block the worker indefinitely.
            response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException:
            print(f"Ошибка при запросе к {self.url}")
            self.fail_task()
            return

        soup = BeautifulSoup(response.text, 'html.parser')
        base_domain = urlparse(self.url).netloc
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href'].strip()
            if not href or href.startswith(('mailto:', 'javascript:')):
                continue
            # Make the link absolute and drop the #fragment.
            abs_url, _ = urldefrag(urljoin(self.url, href))
            # Only links on the same domain are followed.
            if urlparse(abs_url).netloc != base_domain:
                continue
            if check_url(abs_url) or not wp.check_error_url(abs_url):
                continue
            try:
                article = Article(abs_url)
                article.download()
                article.parse()
                print("URL:", abs_url)
                if len(article.text) > 200 and article.publish_date:
                    num += 1
                    # NOTE(review): the former "no publish date -> use current
                    # time" fallback sat inside this condition and could never
                    # run, so it was removed. If undated articles should be
                    # accepted, relax the condition above instead.
                    time_text = article.publish_date.strftime("%Y/%m/%d %H:%M:%S")
                    print(time_text)
                    response_text = gpt_response_message(str(article.text), self.promt)
                    if response_text:
                        update_bd_and_create_document(
                            response_text=response_text,
                            article_date=time_text,
                            url=abs_url,
                            parsed_at=str(datetime.now()),
                            original_text=article.text,
                            other=self.promt
                        )
                else:
                    wp.add_error_url(self.url, abs_url)
            except Exception as e:
                print(f"Ошибка при обработке статьи {abs_url}: {e}")
                logger.info(f"Ошибка при обработке статьи {abs_url}: {e}")
                continue
        if num == 0:
            wp.update_source_status(self.url)
        self.complete_task()
def start_pars_all_istochnik(url: str, promt: str) -> None:
    """Entry point for an arbitrary source: build a ``UniversalParser`` and run it.

    Args:
        url: Index page handed to the parser.
        promt: Prompt value handed to the parser.
    """
    parser = UniversalParser(url, promt)
    parser.parse()
def scheduled_parser_universal(url: str, promt: str) -> None:
    """Scheduler entry point: delegate to ``start_pars_all_istochnik(url, promt)``."""
    start_pars_all_istochnik(url, promt)

20
services/__init__.py Normal file
View File

@@ -0,0 +1,20 @@
"""
Сервисы приложения
"""
from .proxy_manager import (
download_proxies,
get_shuffled_proxies,
fetch_with_proxy,
fetch_with_proxy_retry
)
from .gpt_client import gpt_response_message
from .document_builder import update_bd_and_create_document
__all__ = [
'download_proxies',
'get_shuffled_proxies',
'fetch_with_proxy',
'fetch_with_proxy_retry',
'gpt_response_message',
'update_bd_and_create_document'
]

View File

@@ -0,0 +1,98 @@
"""
Document Builder - создание JSON и DOCX файлов
"""
import json
import os
from docx import Document
from config import DOCUMENTS_DIR
from utils import logger
import work_parser as wp
# Обязательные поля для модели ParsedData
REQUIRED_FIELDS = [
'title', 'category', 'translation_text', 'short_text'
]
def update_bd_and_create_document(
    response_text: str,
    article_date: str,
    url: str,
    parsed_at: str,
    original_text: str,
    other: str
) -> None:
    """Process a GPT answer: save it to the database and write a DOCX file.

    Args:
        response_text: Raw model answer; expected to be a JSON object,
            optionally wrapped in a Markdown code fence.
        article_date: Article date string; its first whitespace-separated
            token is used as the sub-folder below ``DOCUMENTS_DIR``.
        url: URL of the article.
        parsed_at: Timestamp string of the parsing run.
        original_text: Untranslated article text.
        other: Extra value stored with the record.

    Returns:
        None. Every failure is logged and swallowed so that one bad article
        does not stop the calling parser loop.
    """
    if not response_text:
        print(f"Пустой ответ от GPT для URL: {url}")
        logger.info(f"Пустой ответ от GPT для URL: {url}")
        return
    try:
        # The model may wrap its JSON in a Markdown code fence; strip it first.
        clean_response = response_text.strip().replace('```json', '').replace('```', '').strip()
        data = json.loads(clean_response)
        if not isinstance(data, dict):
            raise ValueError(f"ответ GPT не является JSON-объектом: {type(data).__name__}")

        # Report missing keys by name instead of failing later with a bare KeyError.
        missing = [field for field in REQUIRED_FIELDS if field not in data]
        if missing:
            raise ValueError(f"в ответе GPT отсутствуют поля: {', '.join(missing)}")

        # The category may arrive as a list although a string is expected.
        if isinstance(data['category'], list):
            data['category'] = ', '.join(data['category'])
        # Articles without a detected category are not stored at all.
        if not data['category']:
            return

        # System fields added on top of what the model returned.
        data['article_date'] = article_date
        data['url'] = url
        data['parsed_at'] = parsed_at
        data['original_text'] = original_text
        data['status'] = False
        data['viewed'] = False
        data['tematik'] = False
        data['svodka'] = False
        data['donesenie'] = False
        data['bilutene'] = False
        data['other'] = other
        data['download'] = False

        parsed_data = wp.ParsedData(**data)
        wp.save_parsed_data_to_db(parsed_data)
        print("Данные успешно сохранены в БД")

        path_day = article_date.split()[0]
        documents_path = os.path.join(DOCUMENTS_DIR, path_day)
        if not os.path.exists(documents_path):
            os.makedirs(documents_path, exist_ok=True)
            print(f"Создана папка: {documents_path}")

        doc = Document()
        doc.add_heading('Ссылка на статью', level=1)
        doc.add_paragraph(url)
        doc.add_heading('Дата и время', level=1)
        doc.add_paragraph(article_date)
        doc.add_heading('Обнаруженные тематики текста', level=1)
        doc.add_paragraph(data["category"])
        doc.add_heading('Заголовок', level=1)
        doc.add_paragraph(data["title"])
        doc.add_heading('Краткий пересказ', level=1)
        doc.add_paragraph(data["short_text"])
        doc.add_heading('Переведенный текст статьи в газете', level=1)
        doc.add_paragraph(data["translation_text"])
        doc.add_heading('Оригинальный текст', level=1)
        doc.add_paragraph(original_text)

        # The title comes from the model: path separators in it would point the
        # file into a non-existent (or unintended) directory.
        safe_title = str(data['title']).replace('/', '_').replace('\\', '_')
        doc_path = os.path.join(documents_path, f"{safe_title}.docx")
        doc.save(doc_path)
        print(f"Сохранен документ: {doc_path}")
    except json.JSONDecodeError as ex:
        print(f"Ошибка парсинга JSON от GPT для URL {url}: {ex}")
        print(f"Сырой ответ: {response_text[:500]}")
        logger.error(f"JSON decode error для {url}: {ex}")
    except Exception as ex:
        print(f"Ошибка при обработке ответа GPT для URL {url}: {ex}")
        print(f"Сырой ответ: {response_text[:500]}")
        logger.error(f"Ошибка при обработке ответа GPT для {url}: {ex}")

49
services/gpt_client.py Normal file
View File

@@ -0,0 +1,49 @@
"""
GPT клиент - отправка запросов к нейросети
"""
import time
import requests
from config import GPT_SERVER_URL, GPT_MAX_RETRIES, GPT_TIMEOUT
from utils import logger
import work_parser as wp
def gpt_response_message(content: str, name_promt: str) -> str:
    """Send *content* to the GPT server using the prompt named *name_promt*.

    The prompt template is read via ``wp.get_promt`` and its ``{content}``
    placeholder is replaced with the text. Failed requests are retried up to
    ``GPT_MAX_RETRIES`` times with exponential back-off (1 s, 2 s, 4 s, ...).

    Returns:
        The response body, or ``""`` once every attempt has failed.
    """
    contentGPT = wp.get_promt(name_promt).replace('{content}', content)
    params = {'text': contentGPT}
    max_retries = GPT_MAX_RETRIES

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(GPT_SERVER_URL, params=params, timeout=GPT_TIMEOUT)
            print(response.text)
            return response.text
        # The log records now include the exception; before, only a fixed
        # prefix was written and the cause was lost.
        except requests.exceptions.ConnectTimeout as e:
            print(f"Ошибка подключения (timeout): {e}")
            logger.warning(f"gpt_response_message timeout: {e}")
        except requests.exceptions.ConnectionError as e:
            print(f"Ошибка соединения: {e}")
            logger.warning(f"gpt_response_message connection error: {e}")
        except Exception as ex:
            print(f"Ошибка при запросе к GPT: {ex}")
            logger.error(f"gpt_response_message: {ex}")
        # Shared back-off; there is no sleep after the final attempt.
        if attempt < max_retries:
            time.sleep(2 ** (attempt - 1))

    logger.info(f"Превышен лимит запросов {max_retries}")
    return ""

75
services/proxy_manager.py Normal file
View File

@@ -0,0 +1,75 @@
"""
Менеджер прокси - управление загрузкой и использованием прокси
"""
import random
import requests
from config import PROXIES_URL
def download_proxies(url: str = PROXIES_URL, timeout: int = 10) -> list[str]:
    """Download the proxy list, one ``host:port`` entry per line.

    Args:
        url: Location of the plain-text list.
        timeout: Request timeout in seconds.

    Returns:
        The non-empty lines of the response, or ``[]`` when the request fails
        or the status is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Callers iterate over the result, so "no proxies" is the safe answer
        # to a network failure.
        return []
    if response.status_code != 200:
        return []
    # Blank lines would otherwise be tried as proxies named "".
    return [line.strip() for line in response.text.splitlines() if line.strip()]
def get_shuffled_proxies(proxies_list: list[str]) -> list[str]:
    """Return a randomly ordered copy of *proxies_list*; the input is left untouched."""
    randomized = proxies_list.copy()
    random.shuffle(randomized)
    return randomized
def fetch_with_proxy(url: str, proxy: str, verify: bool = True, timeout: int = 10) -> str | None:
    """Fetch *url* through one HTTP proxy.

    Args:
        url: Page to download.
        proxy: Proxy as ``host:port``; used for both http and https traffic.
        verify: Passed to ``requests.get`` as the TLS verification flag.
        timeout: Request timeout in seconds.

    Returns:
        The response text decoded as UTF-8, or ``None`` when the request
        fails, the status is not 200, or the body looks like a block page.
    """
    proxies = {
        'http': f'http://{proxy}',
        'https': f'http://{proxy}',
    }
    try:
        response = requests.get(url, proxies=proxies, timeout=timeout, verify=verify)
        response.encoding = 'utf-8'
        if response.status_code == 200:
            # Some proxies answer 200 but deliver their own error page.
            if '"message":"Request failed' in response.text or '403' in response.text[:500]:
                print(f"Proxy {proxy} - Site returned 403 (inside response)")
                return None
            print(f"Proxy {proxy} - SUCCESS")
            return response.text
        elif response.status_code == 403:
            # The proxy works, but the site blocks it.
            print(f"Proxy {proxy} - 403 Forbidden")
            return None
        else:
            print(f"Proxy {proxy} - Status {response.status_code}")
            return None
    except Exception:
        # A bad proxy is expected and simply skipped. `except Exception` (not a
        # bare `except:`) still lets KeyboardInterrupt and SystemExit through.
        return None
def fetch_with_proxy_retry(url: str, timeout: int = 10, verify: bool = True) -> str:
    """Try the downloaded proxies in random order until one returns a page.

    Returns:
        The first non-empty page text, or ``""`` when no proxy succeeded
        (including the case of an empty proxy list).
    """
    candidates = get_shuffled_proxies(download_proxies(PROXIES_URL))
    for candidate in candidates:
        page = fetch_with_proxy(url, proxy=candidate, timeout=timeout, verify=verify)
        if page:
            return page
    return ""

View File

@@ -1,39 +0,0 @@
from pydantic import BaseModel
from typing import List
import json
# Модель для источника
class Source(BaseModel):
    """One source entry of the settings file; ``update_source`` matches entries by ``name``."""

    name: str
    url: str
    prompt: str
# Модель для настроек (список источников)
class Settings(BaseModel):
    """Top-level settings object: the list of configured sources."""

    sources: List[Source]
# Путь к файлу с настройками
SETTINGS_FILE = "config.json"
# Чтение настроек из файла
def read_settings() -> Settings:
    """Load settings from ``SETTINGS_FILE``.

    Returns:
        The parsed settings, or an empty ``Settings`` when the file is
        missing or does not contain valid JSON.
    """
    try:
        with open(SETTINGS_FILE, "r", encoding="utf-8") as settings_file:
            raw = json.load(settings_file)
    except (FileNotFoundError, json.JSONDecodeError):
        return Settings(sources=[])
    return Settings(**raw)
# Запись настроек в файл
def write_settings(settings: Settings):
    """Write *settings* to ``SETTINGS_FILE`` as indented UTF-8 JSON.

    ``ensure_ascii=False`` keeps non-ASCII text readable in the file.
    """
    with open(SETTINGS_FILE, "w", encoding="utf-8") as f:
        json.dump(settings.dict(), f, ensure_ascii=False, indent=2)
# Обновление данных по источнику
def update_source(new_source: Source) -> dict:
    """Replace the stored source that has the same name as *new_source*.

    Returns:
        ``{"code": 0, ...}`` after the first matching entry was replaced and
        the file rewritten, ``{"code": 1, ...}`` when no entry has that name.
    """
    settings = read_settings()
    position = next(
        (idx for idx, existing in enumerate(settings.sources) if existing.name == new_source.name),
        None,
    )
    if position is None:
        return {"code": 1, "message": f"Источник с именем '{new_source.name}' не найден."}
    settings.sources[position] = new_source
    write_settings(settings)
    return {"code": 0, "message": f"Источник '{new_source.name}' успешно обновлен."}

12
utils/__init__.py Normal file
View File

@@ -0,0 +1,12 @@
"""
Утилиты приложения
"""
from .logger import setup_logger, logger
from .helpers import create_folder, get_current_date_parts
__all__ = [
'setup_logger',
'logger',
'create_folder',
'get_current_date_parts'
]

25
utils/helpers.py Normal file
View File

@@ -0,0 +1,25 @@
"""
Общие вспомогательные функции
"""
from datetime import datetime as dt
def create_folder(num: int) -> str:
    """Return *num* as a decimal string zero-padded to at least two digits.

    Despite the name, nothing is created on disk: the result is only used as
    the day/month component of dated URLs and folder names.

    Args:
        num: Day or month number; numeric strings are accepted as well.

    Returns:
        The padded string, e.g. ``5 -> "05"`` and ``12 -> "12"``.
    """
    # Format the parsed integer rather than prefixing the raw value, so input
    # that is already padded ("05") does not turn into "005".
    return f"{int(num):02d}"
def get_current_date_parts() -> tuple[str, str, str]:
    """
    Return the current local date as formatted ``(year, month, day)`` strings.

    Month and day are passed through ``create_folder`` for zero padding;
    the year is the plain decimal year.
    """
    now = dt.now()
    year = str(now.year)
    month, day = (create_folder(part) for part in (now.month, now.day))
    return year, month, day

22
utils/logger.py Normal file
View File

@@ -0,0 +1,22 @@
"""
Настройка логгера
"""
import logging
from config import LOG_FILE
def setup_logger(name: str = __name__) -> logging.Logger:
    """
    Configure root logging to write to LOG_FILE and return a named logger.

    Args:
        name: Name of the logger to return (defaults to this module's name).

    Returns:
        The ``logging.Logger`` registered under *name*.
    """
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(filename=LOG_FILE, level=logging.INFO, format=log_format)
    return logging.getLogger(name)
# Глобальный логгер
logger = setup_logger()

View File

@@ -1,26 +1,199 @@
import psycopg2 import psycopg2
from psycopg2.extras import RealDictCursor from psycopg2.extras import RealDictCursor
from pydantic import BaseModel
from typing import List, Optional
from fastapi import HTTPException
# Подключение к БД (укажи свои параметры) # Параметры подключения к БД
conn = psycopg2.connect( DB_CONFIG = {
dbname="parsed_url", "dbname": "parsed_url",
user="postgres", "user": "postgres",
password="qwertyqwerty123123", "password": "qwertyqwerty123123",
host="45.129.78.228", "host": "45.129.78.228",
# host="127.0.0.1" "connect_timeout": 10,
connect_timeout=10, "options": "-c statement_timeout=30000"
options="-c statement_timeout=30000" # таймаут запроса 30 сек }
) # Модель для данных, которые приходят в POST
conn.autocommit = True class ParsedData(BaseModel):
url: str
parsed_at: str
title: str
original_text: str
article_date: str
status: Optional[bool] = False
viewed: Optional[bool] = False
tematik: Optional[bool] = False
svodka: Optional[bool] = False
donesenie: Optional[bool] = False
bilutene: Optional[bool] = False
download: Optional[bool] = False
other: str
category: str
translation_text: str
short_text: str
def close_connection(): # Функции для работы с БД (без эндпоинтов)
"""Закрывает подключение к БД""" def save_parsed_data_to_db(data: ParsedData):
global conn conn = None
try:
conn = get_connection()
with conn.cursor() as cur:
cur.execute("""
INSERT INTO url (url, parsed_at, title, original_text, article_date, status, viewed, tematik, svodka, donesenie, download, bilutene, other, category, translation_text, short_text)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (url) DO UPDATE SET
parsed_at = EXCLUDED.parsed_at,
title = EXCLUDED.title,
original_text = EXCLUDED.original_text,
article_date = EXCLUDED.article_date,
status = EXCLUDED.status,
viewed = EXCLUDED.viewed,
tematik = EXCLUDED.tematik,
download = EXCLUDED.download,
svodka = EXCLUDED.svodka,
donesenie = EXCLUDED.donesenie,
bilutene = EXCLUDED.bilutene,
other = EXCLUDED.other,
category = EXCLUDED.category,
translation_text = EXCLUDED.translation_text,
short_text = EXCLUDED.short_text;
""", (data.url, data.parsed_at, data.title, data.original_text, data.article_date, data.status, data.viewed, data.tematik, data.svodka, data.donesenie, data.download, data.bilutene, data.other, data.category, data.translation_text, data.short_text))
conn.commit()
return {"status": "success", "message": "Данные успешно сохранены"}
except Exception as e:
if conn:
conn.rollback()
raise e
finally:
if conn: if conn:
conn.close() conn.close()
conn = None
def get_articles_by_filter(field_name: str, start_date: str, finish_date: str):
    """
    Return the titles of articles flagged by *field_name* within a date range.

    Args:
        field_name: Boolean column to filter on; must be one of the
            whitelisted names below.
        start_date: Lower bound passed to ``article_date BETWEEN``.
        finish_date: Upper bound passed to ``article_date BETWEEN``.

    Returns:
        A list of titles ordered by ``article_date`` descending.

    Raises:
        ValueError: If *field_name* is not whitelisted.
        Exception: Any database error is printed and re-raised.
    """
    conn = get_connection()
    try:
        allowed_fields = ['tematik', 'svodka', 'donesenie', 'bilutene', 'status']
        if field_name not in allowed_fields:
            raise ValueError(f"Недопустимое поле: {field_name}. Разрешено: {allowed_fields}")

        # field_name was checked against the whitelist above, so it is safe
        # to interpolate it as a column name; the dates stay parameterized.
        query = f"""
                SELECT title FROM url
                WHERE {field_name} = TRUE
                AND article_date BETWEEN %s AND %s
                ORDER BY article_date DESC;
            """
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(query, (start_date, finish_date))
            return [record['title'] for record in cur.fetchall()]
    except Exception as e:
        print(f"Ошибка в get_articles_by_filter: {e}")
        raise
def get_download_counts():
    """
    Count, per category column, the articles not yet marked as downloaded.

    Returns:
        A dict mapping each of ``tematik``, ``svodka``, ``donesenie`` and
        ``bilutene`` to the number of rows where that column is TRUE and
        ``download`` is FALSE.

    Raises:
        Exception: Any database error is printed and re-raised.
    """
    conn = get_connection()
    try:
        counts = {}
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            # Column names come from this fixed tuple, never from user input.
            for field in ('tematik', 'svodka', 'donesenie', 'bilutene'):
                cur.execute(f"""
                    SELECT COUNT(*) as count FROM url
                    WHERE {field} = TRUE
                    AND download = FALSE;
                """)
                counts[field] = cur.fetchone()['count']
        return counts
    except Exception as e:
        print(f"Ошибка в get_download_counts: {e}")
        raise
def mark_articles_as_downloaded(titles: list):
    """
    Set ``download = TRUE`` for every article whose title is in *titles*.

    Args:
        titles: Collection of article titles. Any sized iterable is accepted;
            it is converted to a list before being sent to the database.

    Returns:
        A dict with a human-readable ``message`` and the number of
        ``updated_rows``. An empty *titles* short-circuits with zero rows and
        no database access.

    Raises:
        Exception: Any database error is printed and re-raised.
    """
    if not titles:
        return {"message": "Список заголовков пуст", "updated_rows": 0}

    # psycopg2 adapts a Python list to a PostgreSQL ARRAY, which is what
    # "= ANY(...)" requires; a tuple would be rendered as a record and fail.
    title_array = list(titles)

    conn = get_connection()
    try:
        with conn.cursor() as cur:
            cur.execute("""
                UPDATE url
                SET download = TRUE
                WHERE title = ANY(%s);
            """, (title_array,))
            updated_rows = cur.rowcount
        conn.commit()
        return {"message": f"Статус download обновлён для {updated_rows} статей", "updated_rows": updated_rows}
    except Exception as e:
        print(f"Ошибка в mark_articles_as_downloaded: {e}")
        raise
# Global DB connection
conn = None


def _open_connection():
    """Open a new connection from DB_CONFIG with autocommit enabled."""
    fresh = psycopg2.connect(**DB_CONFIG)
    fresh.autocommit = True
    return fresh


def get_connection():
    """Return the shared DB connection, (re)creating it when necessary.

    A new connection is opened when there is none yet, when the current one
    is closed, or when it is stuck inside a transaction (open or aborted).
    If anything goes wrong while inspecting the current connection, the
    error is printed and one fresh connection attempt is made.

    Returns:
        An open psycopg2 connection with ``autocommit = True``.

    Raises:
        psycopg2.Error: If the final attempt to connect fails.
    """
    global conn
    # States in which the shared connection must not be handed out again:
    # an open transaction, or a transaction aborted by a failed statement.
    unusable_states = (
        psycopg2.extensions.TRANSACTION_STATUS_INTRANS,
        psycopg2.extensions.TRANSACTION_STATUS_INERROR,
    )
    try:
        if conn is None or conn.closed:
            conn = _open_connection()
        elif conn.info.transaction_status in unusable_states:
            # Alive but mid-transaction: drop it and start clean.
            try:
                conn.close()
            except Exception as close_err:
                # Best effort: a failed close must not block reconnecting.
                print(f"Ошибка при закрытии подключения: {close_err}")
            conn = _open_connection()
        return conn
    except Exception as e:
        print(f"Ошибка при получении подключения: {e}")
        # Reset the shared handle and try once more from scratch.
        conn = None
        conn = _open_connection()
        return conn
def close_connection():
    """Close the global DB connection if it is currently open."""
    global conn
    if not conn or conn.closed:
        return
    conn.close()
    conn = None
# Check whether the given URL is already present in the database.
def check_url_exists(url: str):
    """Report whether a row with this *url* exists in the ``url`` table.

    Returns:
        ``{"exists": True}`` or ``{"exists": False}``.

    Raises:
        HTTPException: With status 500 when the lookup fails for any reason.
    """
    conn = get_connection()
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT 1 FROM url WHERE url = %s LIMIT 1", (url,))
            found = cur.fetchone()
        # fetchone() yields None when no row matched.
        return {"exists": found is not None}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Ошибка при проверке: {e}")
# работа с базой данных показывания задач work_parser
def create_table(): def create_table():
conn = get_connection()
try: try:
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute(""" cur.execute("""
@@ -37,11 +210,11 @@ def create_table():
); );
""") """)
print("Таблица work_parser создана или уже существует") print("Таблица work_parser создана или уже существует")
finally: except Exception as e:
if conn: print(f"Ошибка при создании таблицы work_parser: {e}")
conn.close()
def insert_task(status, source_url=None, source_id=None, priority=0): def insert_task(status, source_url=None, source_id=None, priority=0):
conn = get_connection()
try: try:
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute(""" cur.execute("""
@@ -51,11 +224,12 @@ def insert_task(status, source_url=None, source_id=None, priority=0):
""", (status, source_url, priority)) """, (status, source_url, priority))
task_id = cur.fetchone()[0] task_id = cur.fetchone()[0]
return task_id return task_id
finally: except Exception as e:
if conn: print(f"Ошибка при создании задачи: {e}")
conn.close() raise
def get_tasks_offset(limit, offset): def get_tasks_offset(limit, offset):
conn = get_connection()
try: try:
with conn.cursor(cursor_factory=RealDictCursor) as cur: with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute(""" cur.execute("""
@@ -65,11 +239,12 @@ def get_tasks_offset(limit, offset):
""", (limit, offset)) """, (limit, offset))
tasks = cur.fetchall() tasks = cur.fetchall()
return tasks return tasks
finally: except Exception as e:
if conn: print(f"Ошибка при получении задач: {e}")
conn.close() raise
def delete_task(task_id: int): def delete_task(task_id: int):
conn = get_connection()
try: try:
with conn.cursor(cursor_factory=RealDictCursor) as cur: with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("DELETE FROM work_parser WHERE id = %s RETURNING *;", (task_id,)) cur.execute("DELETE FROM work_parser WHERE id = %s RETURNING *;", (task_id,))
@@ -78,11 +253,12 @@ def delete_task(task_id: int):
return {"message": f"Задача {task_id} удалена", "deleted_task": dict(deleted_task)} return {"message": f"Задача {task_id} удалена", "deleted_task": dict(deleted_task)}
else: else:
return {"message": f"Задача с id {task_id} не найдена"} return {"message": f"Задача с id {task_id} не найдена"}
finally: except Exception as e:
if conn: print(f"Ошибка при удалении задачи: {e}")
conn.close() raise
def update_task(task_id, **fields): def update_task(task_id, **fields):
conn = get_connection()
try: try:
allowed_fields = ['status', 'started_at', 'finished_at', 'source_url', 'error_message', 'attempts', 'priority'] allowed_fields = ['status', 'started_at', 'finished_at', 'source_url', 'error_message', 'attempts', 'priority']
set_parts = [] set_parts = []
@@ -98,74 +274,82 @@ def update_task(task_id, **fields):
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute(f"UPDATE work_parser SET {set_sql} WHERE id = %s;", values) cur.execute(f"UPDATE work_parser SET {set_sql} WHERE id = %s;", values)
return True return True
finally: except Exception as e:
if conn: print(f"Ошибка при обновлении задачи: {e}")
conn.close() raise
# Создание и работа с таблицей по созданию и редактированию промтов
def create_table_config_gpt(): def create_table_config_gpt():
conn = get_connection()
try: try:
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute(""" cur.execute("""
CREATE TABLE IF NOT EXISTS config_gpt ( CREATE TABLE IF NOT EXISTS config_gpt (
url TEXT PRIMARY KEY, name VARCHAR(20) PRIMARY KEY,
name VARCHAR(20),
promt TEXT promt TEXT
); );
""") """)
print("Таблица config_gpt создана или уже существует") print("Таблица config_gpt создана или уже существует")
finally: except Exception as e:
if conn: print(f"Ошибка при создании таблицы config_gpt: {e}")
conn.close()
def update_promt(url: str, name: str, promt: str): def update_promt(name: str, promt: str):
conn = get_connection()
try: try:
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute(""" cur.execute("""
INSERT INTO config_gpt (url, name, promt) INSERT INTO config_gpt (name, promt)
VALUES (%s, %s, %s) VALUES ( %s, %s)
ON CONFLICT (url) DO UPDATE SET ON CONFLICT (name) DO UPDATE SET
name = EXCLUDED.name,
promt = EXCLUDED.promt promt = EXCLUDED.promt
""", (url, name, promt)) """, (name, promt))
conn.commit() conn.commit()
finally: except Exception as e:
if conn: print(f"Ошибка при обновлении промта: {e}")
conn.close() raise
def get_promt(promt_name_url): def get_promt(promt_name_url):
conn = get_connection()
try: try:
with conn.cursor(cursor_factory=RealDictCursor) as cur: with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("SELECT promt FROM config_gpt WHERE url = %s", (promt_name_url,)) cur.execute("SELECT promt FROM config_gpt WHERE name = %s", (promt_name_url,))
promt = cur.fetchone() promt = cur.fetchone()
if promt:
return promt['promt'] return promt['promt']
finally: return None
if conn: except Exception as e:
conn.close() print(f"Ошибка при получении промта: {e}")
raise
def get_all_promt(): def get_all_promt():
# Возвращает список всех значений поля name из таблицы config_gpt
conn = get_connection()
try: try:
with conn.cursor(cursor_factory=RealDictCursor) as cur: with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("SELECT * FROM config_gpt") cur.execute("SELECT * FROM config_gpt")
rows = cur.fetchall() rows = cur.fetchall()
sources = [{"url": row["url"], "name": row["name"], "promt": row["promt"]} for row in rows] sources = [{"name": row["name"], "promt": row["promt"]} for row in rows]
return {"sources": sources} return {"sources": sources}
finally: except Exception as e:
if conn: print(f"Ошибка при получении всех промтов: {e}")
conn.close() raise
def get_all_categories_promt(): def get_all_categories_promt():
conn = get_connection()
try: try:
with conn.cursor(cursor_factory=RealDictCursor) as cur: with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("SELECT name FROM config_gpt") cur.execute("SELECT name FROM config_gpt")
rows = cur.fetchall() rows = cur.fetchall()
return [row["name"] for row in rows] return [row["name"] for row in rows]
finally: except Exception as e:
if conn: print(f"Ошибка при получении категорий: {e}")
conn.close() raise
# Создание, сохранение и работа с таблицей ошибочных ссылок (error_url)
def create_table_error_url(): def create_table_error_url():
conn = get_connection()
try: try:
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute(""" cur.execute("""
@@ -176,11 +360,11 @@ def create_table_error_url():
); );
""") """)
print("Таблица error_url создана или уже существует") print("Таблица error_url создана или уже существует")
finally: except Exception as e:
if conn: print(f"Ошибка при создании таблицы error_url: {e}")
conn.close()
def add_error_url(source_url: str, error_sources_url: str): def add_error_url(source_url: str, error_sources_url: str):
conn = get_connection()
try: try:
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute(""" cur.execute("""
@@ -189,11 +373,12 @@ def add_error_url(source_url: str, error_sources_url: str):
RETURNING id; RETURNING id;
""", (source_url, error_sources_url)) """, (source_url, error_sources_url))
return cur.fetchone()[0] return cur.fetchone()[0]
finally: except Exception as e:
if conn: print(f"Ошибка при добавлении error_url: {e}")
conn.close() raise
def check_error_url(error_sources_url: str) -> bool: def check_error_url(error_sources_url: str) -> bool:
conn = get_connection()
try: try:
with conn.cursor(cursor_factory=RealDictCursor) as cur: with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute(""" cur.execute("""
@@ -204,37 +389,113 @@ def check_error_url(error_sources_url: str) -> bool:
row = cur.fetchone() row = cur.fetchone()
return row is None return row is None
finally: except Exception as e:
if conn: print(f"Ошибка при проверке error_url: {e}")
conn.close() return True
# Создание и работа с таблицей источников sources
def create_table_add_sourse(): def create_table_add_sourse():
conn = get_connection()
try: try:
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute(""" cur.execute("""
CREATE TABLE IF NOT EXISTS sourse ( CREATE TABLE IF NOT EXISTS sourse (
url TEXT PRIMARY KEY, url TEXT PRIMARY KEY,
promt TEXT promt TEXT,
status BOOLEAN DEFAULT FALSE
); );
""") """)
print("Таблица sourse создана или уже существует") print("Таблица sourse создана или уже существует")
finally: except Exception as e:
if conn: print(f"Ошибка при создании таблицы sourse: {e}")
conn.close()
def add_sources(url: str, promt: str): def add_sources(url: str, promt: str, status: bool = False):
conn = get_connection()
try: try:
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute(""" cur.execute("""
INSERT INTO config_gpt (url, promt) INSERT INTO sourse (url, promt, status)
VALUES (%s, %s) VALUES (%s, %s, %s)
ON CONFLICT (url) DO UPDATE SET ON CONFLICT (url) DO UPDATE SET
promt = EXCLUDED.promt promt = EXCLUDED.promt,
""", (url, promt)) status = EXCLUDED.status
""", (url, promt, status))
conn.commit() conn.commit()
finally: except Exception as e:
if conn: print(f"Ошибка при добавлении источника: {e}")
conn.close() raise
def get_all_sources(category: str):
    """Return rows of the ``sourse`` table, rows with status=false first.

    Args:
        category: ``"all"`` to return every row; any other value returns
            only rows whose ``promt`` column equals it.

    Returns:
        ``{"sources": [{"url", "promt", "status"}, ...]}`` on success, or
        ``{"error": <text>, "sources": []}`` when the query fails.
    """
    conn = get_connection()
    try:
        if category == "all":
            query = """
                    SELECT * FROM sourse
                    ORDER BY status ASC, url ASC
                """
            params = None
        else:
            query = """
                    SELECT * FROM sourse
                    WHERE promt = %s
                    ORDER BY status ASC, url ASC
                """
            params = (category,)

        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(query, params)
            records = cur.fetchall()

        sources = [
            {"url": rec["url"], "promt": rec["promt"], "status": rec["status"]}
            for rec in records
        ]
        return {"sources": sources}
    except Exception as e:
        print(f"Ошибка при получении источников: {e}")
        return {"error": str(e), "sources": []}
def get_true_sources():
    """Return a ``{url: promt}`` mapping of ``sourse`` rows with status = false.

    NOTE(review): despite the function name, the query selects rows where
    ``status = false`` — confirm which of the two is intended.

    Returns:
        A dict mapping each selected row's ``url`` to its ``promt``. When
        the query fails, ``{"error": <text>, "sources": []}`` is returned
        instead.
        NOTE(review): that error dict has the same type as a normal result,
        so a caller iterating it would see "error"/"sources" as keys — verify
        that callers check for it.
    """
    conn = get_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM sourse
                WHERE status = false
            """)
            records = cur.fetchall()
        return {rec["url"]: rec["promt"] for rec in records}
    except Exception as e:
        print(f"Ошибка при получении источников: {e}")
        return {"error": str(e), "sources": []}
def update_source_status(url: str, status: bool = True):
    """Set the ``status`` column of the ``sourse`` row identified by *url*.

    Args:
        url: URL of the source to update.
        status: New status value (defaults to True).

    Returns:
        ``{"message": ..., "updated_rows": <count>}`` on success (the count
        may be 0 when no row matched), or
        ``{"error": <text>, "updated_rows": 0}`` when the update fails.
    """
    conn = get_connection()
    try:
        with conn.cursor() as cur:
            cur.execute("""
                UPDATE sourse SET status = %s WHERE url = %s
            """, (status, url))
            affected = cur.rowcount
        conn.commit()
        return {"message": f"Статус обновлён для {url}", "updated_rows": affected}
    except Exception as e:
        print(f"Ошибка при обновлении статуса: {e}")
        return {"error": str(e), "updated_rows": 0}
def delete_sources(url: str):
    """Delete the source with the given *url* from the ``sourse`` table.

    Returns:
        ``{"message": ..., "deleted": True}`` when a row was removed,
        ``{"message": ..., "deleted": False}`` when no such row existed, or
        ``{"error": <text>, "deleted": False}`` when the statement fails.
    """
    conn = get_connection()
    try:
        with conn.cursor() as cur:
            cur.execute("DELETE FROM sourse WHERE url=%s RETURNING *", (url,))
            # RETURNING yields the removed row, or nothing if none matched.
            was_removed = cur.fetchone() is not None
        conn.commit()
        if not was_removed:
            return {"message": f"Источник с url {url} не найден", "deleted": False}
        return {"message": f"Источник {url} удалён", "deleted": True}
    except Exception as e:
        print(f"Ошибка при удалении источника: {e}")
        return {"error": str(e), "deleted": False}
# Пример использования # Пример использования
# if __name__ == "__main__": # if __name__ == "__main__":
@@ -247,3 +508,4 @@ def add_sources(url: str, promt: str):
# # print(get_promt("japan")) # # print(get_promt("japan"))
# # create_table_error_url() # # create_table_error_url()
# create_table_add_sourse() # create_table_add_sourse()
# delete_sources("https://www.taipeitimes.com/")