Compare commits

..

27 Commits

Author SHA1 Message Date
7f5e9df751 CORS
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-31 16:11:07 +10:00
3266e79e54 ""
All checks were successful
continuous-integration/drone Build is passing
2026-05-20 22:28:33 +10:00
55710cc961 старт коректирую
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-20 20:39:25 +10:00
63ab6ed108 добавление отслеживания количества выгрузки
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-19 22:36:49 +10:00
e9d7a2d51f правка
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-16 13:04:03 +10:00
770445feaa добавил атостарт всех сайтов
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-16 12:41:54 +10:00
656c8d9d9a исправление в выгрузке за пеиод прибавляющее один день к финишу, это связано с особенностями sql запроса
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-08 14:35:42 +10:00
dd1c36b9de убрал ограничение по сохранению док файла
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-08 12:17:16 +10:00
c9abd97efa подредактирована стабильная работа с БД
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-08 11:54:05 +10:00
a841e1f58a дебаг
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-08 11:07:46 +10:00
e5a93e307a long vigryzka 100
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-08 11:05:07 +10:00
74143fd369 url fiks
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-08 10:53:18 +10:00
05cd85d8b5 длина сохранения 100
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-07 21:25:52 +10:00
707c523b53 abrc cj[hfytybz
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-07 20:59:25 +10:00
83020c3124 исправляю ошибки
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-07 18:58:18 +10:00
d0f0ea60a8 abs_url
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-05 22:14:12 +10:00
c7f0ad856a rjhtrrn
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-05 21:53:14 +10:00
94b608cd6d abrc
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-05 21:50:34 +10:00
ef453f661a добавил статус
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-05 21:24:45 +10:00
46350c1c09 ремув баг
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-05 20:22:12 +10:00
3e2820c8ce rer
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-05 19:44:41 +10:00
b688336161 rebut bag
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-05 19:29:47 +10:00
d55e96978e retest
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-03 13:56:13 +10:00
0577f4d65c test
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-03 13:48:49 +10:00
8f86c51d19 расширение выгрузки
All checks were successful
continuous-integration/drone/push Build is passing
Co-authored-by: Copilot <copilot@github.com>
2026-05-01 23:33:40 +10:00
994479fd9d ljltkfk
All checks were successful
continuous-integration/drone/push Build is passing
2026-05-01 22:23:04 +10:00
bb35de53d4 убрал parser_bd 2026-05-01 20:41:55 +10:00
10 changed files with 320 additions and 446 deletions

View File

@@ -12,10 +12,9 @@ from fastapi.responses import FileResponse
from config import DOCUMENTS_DIR, APP_TITLE, APP_DESCRIPTION, APP_VERSION from config import DOCUMENTS_DIR, APP_TITLE, APP_DESCRIPTION, APP_VERSION
from utils import logger from utils import logger
from api.schemas import ParserOneRequest, Parserall, Source, DownloadRange from api.schemas import ParserOneRequest, Parserall, Source, DownloadRange, DownloadCountsResponse
from parsers import start_pars_one_istochnik, start_pars_two_istochnik, start_pars_all_istochnik from parsers import start_pars_one_istochnik, start_pars_two_istochnik, start_pars_all_istochnik
import work_parser as wp import work_parser as wp
import parser_bd as pbd
def setup_routes(app: FastAPI) -> None: def setup_routes(app: FastAPI) -> None:
@@ -26,7 +25,7 @@ def setup_routes(app: FastAPI) -> None:
# CORS middleware # CORS middleware
app.add_middleware( app.add_middleware(
CORSMiddleware, CORSMiddleware,
allow_origins=["*"], allow_origins=["http://localhost:5173", "https://allowlgroup.ru"],
allow_credentials=True, allow_credentials=True,
allow_methods=["*"], allow_methods=["*"],
allow_headers=["*"], allow_headers=["*"],
@@ -51,8 +50,8 @@ def setup_routes(app: FastAPI) -> None:
return {"status": "success", "message": "Источник добавлен", "data": result} return {"status": "success", "message": "Источник добавлен", "data": result}
@app.get("/all_sources", summary="Метод получения всех источников") @app.get("/all_sources", summary="Метод получения всех источников")
async def get_all_sources(): async def get_all_sources(category: str = "all"):
return wp.get_all_sources() return wp.get_all_sources(category)
@app.delete("/delete_sources", summary="Метод удаления источника") @app.delete("/delete_sources", summary="Метод удаления источника")
async def delete_sources(url: str): async def delete_sources(url: str):
@@ -113,36 +112,54 @@ def setup_routes(app: FastAPI) -> None:
@app.post("/download_all", summary="Скачать все файлы за период") @app.post("/download_all", summary="Скачать все файлы за период")
async def download_all(dates: DownloadRange, background_tasks: BackgroundTasks): async def download_all(dates: DownloadRange, background_tasks: BackgroundTasks):
date_start = dates.data_start date_start_str = dates.data_start
date_finish = dates.data_finish date_finish_str = dates.data_finish
field_name = getattr(dates, 'field_name', 'status') # Поле для фильтрации (по умолчанию 'status')
try: try:
start_date = datetime.strptime(date_start, "%Y-%m-%d") start_date = datetime.strptime(date_start_str, "%Y-%m-%d")
finish_date = datetime.strptime(date_finish, "%Y-%m-%d") finish_date = datetime.strptime(date_finish_str, "%Y-%m-%d") + timedelta(days=1)
except ValueError: except ValueError:
return {"error": "Неверный формат даты. Используйте YYYY-MM-DD"} return {"error": "Неверный формат даты. Используйте YYYY-MM-DD"}
if start_date > finish_date: if start_date > finish_date:
return {"error": "Дата начала не может быть позже даты окончания"} return {"error": "Дата начала не может быть позже даты окончания"}
all_files = [] # 1. Получаем список заголовков из БД
start_date_str = start_date.strftime("%Y-%m-%d")
finish_date_str = finish_date.strftime("%Y-%m-%d")
try:
titles_from_db = wp.get_articles_by_filter(field_name, start_date_str, finish_date_str)
except Exception as e:
return {"error": f"Ошибка при получении данных из БД: {e}"}
if not titles_from_db:
return {"error": "Нет статей с выбранным фильтром за указанный период", "field_name": field_name}
# 2. Собираем все файлы .docx за период
all_files = []
current_date = start_date current_date = start_date
while current_date <= finish_date: while current_date <= finish_date :
date_path = current_date.strftime("%Y/%m/%d") date_path = current_date.strftime("%Y/%m/%d")
full_dir_path = os.path.join(DOCUMENTS_DIR, date_path) full_dir_path = os.path.join(DOCUMENTS_DIR, date_path)
if os.path.exists(full_dir_path): if os.path.exists(full_dir_path):
for file in os.listdir(full_dir_path): for file in os.listdir(full_dir_path):
if file.endswith('.docx'): if file.endswith('.docx'):
all_files.append(os.path.join(full_dir_path, file)) file_title = file[:-5] # убираем расширение .docx
if file_title in titles_from_db:
all_files.append(os.path.join(full_dir_path, file))
current_date += timedelta(days=1) current_date += timedelta(days=1)
if not all_files: if not all_files:
return {"error": "Файлы не найдены за указанный период", "date_start": date_start, "date_finish": date_finish} return {"error": "Файлы не найдены за указанный период",
"date_start": date_start_str,
"date_finish": date_finish_str,
"titles_found": len(titles_from_db)}
archive_name = f"documents_{date_start}_{date_finish}.zip" archive_name = f"documents_{date_start_str}_{date_finish_str}.zip"
archive_path = os.path.join(DOCUMENTS_DIR, archive_name) archive_path = os.path.join(DOCUMENTS_DIR, archive_name)
try: try:
@@ -161,6 +178,13 @@ def setup_routes(app: FastAPI) -> None:
except Exception as e: except Exception as e:
logger.warning(f"Не удалось удалить архив: {e}") logger.warning(f"Не удалось удалить архив: {e}")
def mark_as_downloaded():
try:
wp.mark_articles_as_downloaded(titles_from_db)
logger.info(f"Статьи помечены как скачанные: {len(titles_from_db)} записей")
except Exception as e:
logger.error(f"Ошибка при обновлении download: {e}")
response = FileResponse( response = FileResponse(
path=archive_path, path=archive_path,
filename=archive_name, filename=archive_name,
@@ -172,87 +196,29 @@ def setup_routes(app: FastAPI) -> None:
response.headers["Access-Control-Expose-Headers"] = "Content-Disposition" response.headers["Access-Control-Expose-Headers"] = "Content-Disposition"
background_tasks.add_task(cleanup_archive) background_tasks.add_task(cleanup_archive)
background_tasks.add_task(mark_as_downloaded)
return response return response
# ==================== Выгрузка (download) ====================
@app.get("/download_counts", summary="Получить количество статей для выгрузки", response_model=DownloadCountsResponse)
async def get_download_counts():
"""
Возвращает количество статей для каждого поля (tematik, svodka, donesenie, bilutene, status),
где значение поля = TRUE и download = FALSE
"""
return wp.get_download_counts()
# @app.post("/mark_downloaded", summary="Отметить статьи как скачанные")
# async def mark_articles_as_downloaded(titles: List[str]):
# """
# Обновляет поле download = TRUE для списка заголовков статей
# """
# return wp.mark_articles_as_downloaded(titles)
@app.get("/logs", summary="Показать логи") @app.get("/logs", summary="Показать логи")
async def get_logs(): async def get_logs():
with open("app.log", "r") as file: with open("app.log", "r") as file:
lines = file.readlines()[-10:] lines = file.readlines()[-10:]
return {"logs": lines} return {"logs": lines}
# ==================== Эндпоинты из parser_bd.py ====================
@app.post("/save_parsed_data", summary="Сохранить данные парсинга")
def save_parsed_data(data: pbd.ParsedData):
try:
pbd.save_parsed_data_to_db(data)
return {"status": "success", "message": "Данные успешно сохранены"}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Ошибка при сохранении данных: {e}")
@app.post("/update_viewed_status", summary="Обновляет поле viewed")
def update_viewed_status(url: str, viewed: bool):
try:
result = pbd.update_viewed_status_in_db(url, viewed)
if not result.get("found"):
raise HTTPException(status_code=404, detail="Запись с указанным URL не найдена")
except Exception as e:
if isinstance(e, HTTPException):
raise e
raise HTTPException(status_code=500, detail=f"Ошибка при сохранении данных: {e}")
return {"status": "success", "message": "Статус просмотра успешно обновлен"}
@app.post("/update_status_status", summary="Обновляет поле status")
def update_status_status(url: str, status: bool):
try:
result = pbd.update_status_status_in_db(url, status)
if not result.get("found"):
raise HTTPException(status_code=404, detail="Запись с указанным URL не найдена")
except Exception as e:
if isinstance(e, HTTPException):
raise e
raise HTTPException(status_code=500, detail=f"Ошибка при сохранении данных: {e}")
return {"status": "success", "message": "Статус просмотра успешно обновлен"}
@app.get("/check_url_exists", summary="Проверяет url")
def check_url_exists(url: str):
try:
return pbd.check_url_exists_in_db(url)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Ошибка при проверке: {e}")
@app.get("/records", summary="Получить записи из БД с пагинацией", response_model=List[pbd.ParsedData])
def get_records(offset: int = Query(0, ge=0), limit: int = Query(10, ge=1, le=100)):
try:
return pbd.get_records_from_db(offset, limit)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Ошибка при получении записей из БД: {e}")
@app.get("/records_all/count", summary="Получить общее количество записей")
def get_records_count(item: str = Query("default")):
try:
return pbd.get_records_count_from_db(item)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Ошибка при получении количества: {e}")
@app.get("/poisk/count", summary="Получить количество результатов поиска")
def get_poisk_count(query: str, item: str = Query("default")):
try:
return pbd.get_poisk_count_from_db(query, item)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Ошибка при получении количества: {e}")
@app.get("/poisk", summary="Поиск с пагинацией")
def poisk(query: str, offset: int = Query(0, ge=0), limit: int = Query(10, ge=1, le=100), item: str = Query("default")):
try:
return pbd.poisk_from_db(query, offset, limit, item)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Ошибка при поиске: {e}")
@app.get("/records_all", summary="Получить все записи из БД + сортирует + пагинация", response_model=List[pbd.ParsedData])
def get_records_all(item: str = Query("default"), offset: int = Query(0, ge=0), limit: int = Query(10, ge=1, le=100)):
try:
return pbd.get_records_all_from_db(item, offset, limit)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Ошибка при получении записей из БД: {e}")

View File

@@ -31,3 +31,12 @@ class DownloadRange(BaseModel):
"""Диапазон дат для скачивания файлов""" """Диапазон дат для скачивания файлов"""
data_start: str data_start: str
data_finish: str data_finish: str
field_name: str = "status"
class DownloadCountsResponse(BaseModel):
"""Ответ с количеством статей для выгрузки по каждому полю"""
tematik: int
svodka: int
donesenie: int
bilutene: int

27
main.py
View File

@@ -4,6 +4,7 @@ Parser API - Точка входа приложения
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from fastapi import FastAPI from fastapi import FastAPI
import uvicorn import uvicorn
from work_parser import get_true_sources
from config import ( from config import (
APP_TITLE, APP_TITLE,
@@ -16,10 +17,11 @@ from config import (
SCHEDULED_PARSER_2_MINUTE SCHEDULED_PARSER_2_MINUTE
) )
from apscheduler.schedulers.asyncio import AsyncIOScheduler from apscheduler.schedulers.asyncio import AsyncIOScheduler
from parsers import scheduled_parser_1, scheduled_parser_2 from parsers import scheduled_parser_1, scheduled_parser_2, scheduled_parser_universal
from api import setup_routes from api import setup_routes
# Инициализация планировщика # Инициализация планировщика
scheduler = AsyncIOScheduler() scheduler = AsyncIOScheduler()
@@ -30,12 +32,35 @@ async def lifespan(app: FastAPI):
# Startup # Startup
scheduler.add_job(scheduled_parser_1, "cron", hour=SCHEDULED_PARSER_1_HOUR, minute=SCHEDULED_PARSER_1_MINUTE) scheduler.add_job(scheduled_parser_1, "cron", hour=SCHEDULED_PARSER_1_HOUR, minute=SCHEDULED_PARSER_1_MINUTE)
scheduler.add_job(scheduled_parser_2, "cron", hour=SCHEDULED_PARSER_2_HOUR, minute=SCHEDULED_PARSER_2_MINUTE) scheduler.add_job(scheduled_parser_2, "cron", hour=SCHEDULED_PARSER_2_HOUR, minute=SCHEDULED_PARSER_2_MINUTE)
# Получаем все источники и распределяем их равномерно по 24 часам
sources = get_true_sources().items()
num_sources = len(sources)
if num_sources > 0:
total_minutes_per_day = 24 * 60
minutes_per_source = total_minutes_per_day / num_sources
for idx, (url, promt) in enumerate(sources):
total_minutes = int(idx * minutes_per_source)
scheduled_hour = total_minutes // 60
scheduled_minute = total_minutes % 60
# Для универсального парсера нужно передавать url и promt как аргументы
scheduler.add_job(
scheduled_parser_universal,
"cron",
hour=scheduled_hour,
minute=scheduled_minute,
args=[url, promt]
)
scheduler.start() scheduler.start()
yield yield
# Shutdown # Shutdown
scheduler.shutdown() scheduler.shutdown()
# Создание приложения FastAPI # Создание приложения FastAPI
app = FastAPI( app = FastAPI(
title=APP_TITLE, title=APP_TITLE,

View File

@@ -1,285 +0,0 @@
from pydantic import BaseModel
from typing import List, Optional
import psycopg2
import psycopg2.extras
from fastapi import HTTPException
# Модель для данных, которые приходят в POST
class ParsedData(BaseModel):
url: str
parsed_at: str
title: str
original_text: str
article_date: str
status: Optional[bool] = False
viewed: Optional[bool] = False
tematik: Optional[bool] = False
svodka: Optional[bool] = False
donesenie: Optional[bool] = False
bilutene: Optional[bool] = False
other: str
category: str
translation_text: str
short_text: str
# Подключение к БД
def get_connection():
return psycopg2.connect(
dbname="parsed_url",
user="postgres",
password="qwertyqwerty123123",
host="45.129.78.228"
)
# Функции для работы с БД (без эндпоинтов)
def save_parsed_data_to_db(data: ParsedData):
conn = None
try:
conn = get_connection()
with conn.cursor() as cur:
cur.execute("""
INSERT INTO url (url, parsed_at, title, original_text, article_date, status, viewed, tematik, svodka, donesenie, bilutene, other, category, translation_text, short_text)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (url) DO UPDATE SET
parsed_at = EXCLUDED.parsed_at,
title = EXCLUDED.title,
original_text = EXCLUDED.original_text,
article_date = EXCLUDED.article_date,
status = EXCLUDED.status,
viewed = EXCLUDED.viewed,
tematik = EXCLUDED.tematik,
svodka = EXCLUDED.svodka,
donesenie = EXCLUDED.donesenie,
bilutene = EXCLUDED.bilutene,
other = EXCLUDED.other,
category = EXCLUDED.category,
translation_text = EXCLUDED.translation_text,
short_text = EXCLUDED.short_text;
""", (data.url, data.parsed_at, data.title, data.original_text, data.article_date, data.status, data.viewed, data.tematik, data.svodka, data.donesenie, data.bilutene, data.other, data.category, data.translation_text, data.short_text))
conn.commit()
return {"status": "success", "message": "Данные успешно сохранены"}
except Exception as e:
if conn:
conn.rollback()
raise e
finally:
if conn:
conn.close()
def update_viewed_status_in_db(url: str, viewed: bool):
conn = None
try:
conn = get_connection()
with conn.cursor() as cursor:
cursor.execute(
"""
UPDATE url
SET viewed = %s
WHERE url = %s
""",
(viewed, url)
)
if cursor.rowcount == 0:
return {"found": False}
conn.commit()
return {"found": True}
except Exception as e:
if conn:
conn.rollback()
raise e
finally:
if conn:
conn.close()
def update_status_status_in_db(url: str, status: bool):
conn = None
try:
conn = get_connection()
with conn.cursor() as cursor:
cursor.execute(
"""
UPDATE url
SET status = %s
WHERE url = %s
""",
(status, url)
)
if cursor.rowcount == 0:
return {"found": False}
conn.commit()
return {"found": True}
except Exception as e:
if conn:
conn.rollback()
raise e
finally:
if conn:
conn.close()
def check_url_exists_in_db(url: str):
conn = None
try:
conn = get_connection()
with conn.cursor() as cursor:
cursor.execute(
"SELECT 1 FROM url WHERE url = %s LIMIT 1",
(url,)
)
result = cursor.fetchone()
return {"exists": bool(result)}
except Exception as e:
raise e
finally:
if conn:
conn.close()
def get_records_from_db(offset: int = 0, limit: int = 10):
conn = None
try:
conn = get_connection()
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute("""
SELECT url, parsed_at, title, original_text, article_date, status, viewed, other, category, translation_text, short_text
FROM url
ORDER BY parsed_at DESC NULLS LAST
OFFSET %s LIMIT %s
""", (offset, limit))
rows = cur.fetchall()
results = [dict(row) for row in rows]
return results
except Exception as e:
raise e
finally:
if conn:
conn.close()
def get_records_count_from_db(item: str = "default"):
conn = None
try:
conn = get_connection()
with conn.cursor() as cur:
if item == "viewed":
cur.execute("SELECT COUNT(*) FROM url WHERE viewed = true")
elif item == "status":
cur.execute("SELECT COUNT(*) FROM url WHERE status = true")
elif item == "time":
cur.execute("SELECT COUNT(*) FROM url")
else:
cur.execute("SELECT COUNT(*) FROM url")
result = cur.fetchone()
return {"count": result[0]}
except Exception as e:
raise e
finally:
if conn:
conn.close()
def get_poisk_count_from_db(query: str, item: str = "default"):
conn = None
try:
conn = get_connection()
with conn.cursor() as cur:
search_pattern = f"%{query}%"
base_query = """SELECT COUNT(*) FROM url WHERE (
title ILIKE %s OR original_text ILIKE %s OR translation_text ILIKE %s
OR short_text ILIKE %s OR url ILIKE %s OR category ILIKE %s OR other ILIKE %s
)"""
params = [search_pattern] * 7
if item == "viewed":
base_query += " AND viewed = true"
elif item == "status":
base_query += " AND status = true"
cur.execute(base_query, params)
result = cur.fetchone()
return {"count": result[0]}
except Exception as e:
raise e
finally:
if conn:
conn.close()
def poisk_from_db(query: str, offset: int = 0, limit: int = 10, item: str = "default"):
conn = None
try:
conn = get_connection()
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
search_pattern = f"%{query}%"
base_query = """SELECT * FROM url WHERE (
title ILIKE %s OR original_text ILIKE %s OR translation_text ILIKE %s
OR short_text ILIKE %s OR url ILIKE %s OR category ILIKE %s OR other ILIKE %s
)"""
params = [search_pattern] * 7
if item == "viewed":
base_query += " AND viewed = true"
elif item == "status":
base_query += " AND status = true"
base_query += " ORDER BY article_date DESC OFFSET %s LIMIT %s"
params.extend([offset, limit])
cur.execute(base_query, params)
rows = cur.fetchall()
results = [dict(row) for row in rows]
return results
except Exception as e:
raise e
finally:
if conn:
conn.close()
def get_records_all_from_db(item: str = "default", offset: int = 0, limit: int = 10):
conn = None
try:
conn = get_connection()
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
if item == "time":
cur.execute("""
SELECT * FROM url
ORDER BY viewed ASC, parsed_at DESC
OFFSET %s LIMIT %s
""", (offset, limit))
elif item == "viewed":
cur.execute("""
SELECT * FROM url
WHERE viewed = true
ORDER BY article_date DESC
OFFSET %s LIMIT %s
""", (offset, limit))
elif item == "status":
cur.execute("""
SELECT * FROM url
WHERE status = true
ORDER BY parsed_at DESC
OFFSET %s LIMIT %s
""", (offset, limit))
elif item == "Япония" or item == "Корея":
cur.execute("""
SELECT * FROM url
WHERE other = %s
ORDER BY viewed ASC, article_date DESC
OFFSET %s LIMIT %s
""", (item, offset, limit))
elif item == "Китай":
cur.execute("""
SELECT * FROM url
WHERE other IN (%s, %s, %s)
ORDER BY viewed ASC, article_date DESC
OFFSET %s LIMIT %s
""", ("source1", "source2", "Китай", offset, limit))
else:
cur.execute("""
SELECT * FROM url
ORDER BY viewed ASC, article_date DESC
OFFSET %s LIMIT %s
""", (offset, limit))
rows = cur.fetchall()
results = [dict(row) for row in rows]
return results
except Exception as e:
raise e
finally:
if conn:
conn.close()

View File

@@ -4,7 +4,7 @@
from .base import BaseParser from .base import BaseParser
from .source1 import Source1Parser, start_pars_one_istochnik, scheduled_parser_1 from .source1 import Source1Parser, start_pars_one_istochnik, scheduled_parser_1
from .source2 import Source2Parser, start_pars_two_istochnik, scheduled_parser_2 from .source2 import Source2Parser, start_pars_two_istochnik, scheduled_parser_2
from .universal import UniversalParser, start_pars_all_istochnik from .universal import UniversalParser, start_pars_all_istochnik, scheduled_parser_universal
__all__ = [ __all__ = [
'BaseParser', 'BaseParser',
@@ -15,5 +15,6 @@ __all__ = [
'start_pars_two_istochnik', 'start_pars_two_istochnik',
'scheduled_parser_2', 'scheduled_parser_2',
'UniversalParser', 'UniversalParser',
'start_pars_all_istochnik' 'start_pars_all_istochnik',
'scheduled_parser_universal'
] ]

View File

@@ -158,4 +158,4 @@ def scheduled_parser_1() -> None:
""" """
Функция для автоматического запуска по расписанию Функция для автоматического запуска по расписанию
""" """
start_pars_one_istochnik() start_pars_one_istochnik("")

View File

@@ -130,7 +130,7 @@ class Source2Parser(BaseParser):
text, time_text = extract_text_from_url(hrefs) text, time_text = extract_text_from_url(hrefs)
if len(text) >= MIN_ARTICLE_TEXT_LENGTH: if len(text) >= MIN_ARTICLE_TEXT_LENGTH:
response_text = gpt_response_message(text, "source2") response_text = gpt_response_message(text, "source2")
print(response_text) # print(response_text)
if response_text: if response_text:
update_bd_and_create_document( update_bd_and_create_document(
response_text=response_text, response_text=response_text,

View File

@@ -46,7 +46,7 @@ class UniversalParser(BaseParser):
""" """
print(f"Начало парсинга: {self.url} с промтом: {self.promt}") print(f"Начало парсинга: {self.url} с промтом: {self.promt}")
self.start_task(self.url) self.start_task(self.url)
num = 0
try: try:
response = requests.get(self.url) response = requests.get(self.url)
# print(response.text) # print(response.text)
@@ -74,7 +74,7 @@ class UniversalParser(BaseParser):
if parsed.netloc != base_domain: if parsed.netloc != base_domain:
continue continue
# print(num)
if not check_url(abs_url) and wp.check_error_url(abs_url): if not check_url(abs_url) and wp.check_error_url(abs_url):
try: try:
@@ -85,9 +85,11 @@ class UniversalParser(BaseParser):
print("URL:", abs_url) print("URL:", abs_url)
if len(article.text) > 200 and article.publish_date: if len(article.text) > 200 and article.publish_date:
num += 1
# Если дата публикации отсутствует - используем текущую # Если дата публикации отсутствует - используем текущую
if article.publish_date: if article.publish_date:
time_text = article.publish_date.strftime("%Y/%m/%d %H:%M:%S") time_text = article.publish_date.strftime("%Y/%m/%d %H:%M:%S")
print(time_text)
else: else:
time_text = datetime.now().strftime("%Y/%m/%d %H:%M:%S") time_text = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
print(f"Дата публикации отсутствует, используем текущую: {time_text}") print(f"Дата публикации отсутствует, используем текущую: {time_text}")
@@ -109,7 +111,8 @@ class UniversalParser(BaseParser):
print(f"Ошибка при обработке статьи {abs_url}: {e}") print(f"Ошибка при обработке статьи {abs_url}: {e}")
logger.info(f"Ошибка при обработке статьи {abs_url}: {e}") logger.info(f"Ошибка при обработке статьи {abs_url}: {e}")
continue continue
if num == 0:
wp.update_source_status(self.url)
self.complete_task() self.complete_task()
@@ -119,3 +122,10 @@ def start_pars_all_istochnik(url: str, promt: str) -> None:
""" """
parser = UniversalParser(url, promt) parser = UniversalParser(url, promt)
parser.parse() parser.parse()
def scheduled_parser_universal(url: str, promt: str) -> None:
"""
Функция для автоматического запуска универсального парсера по расписанию
"""
start_pars_all_istochnik(url, promt)

View File

@@ -35,14 +35,6 @@ def update_bd_and_create_document(
clean_response = response_text.strip().replace('```json', '').replace('```', '').strip() clean_response = response_text.strip().replace('```json', '').replace('```', '').strip()
data = json.loads(clean_response) data = json.loads(clean_response)
# Проверяем наличие обязательных полей от GPT
# missing_fields = [f for f in REQUIRED_FIELDS if not data.get(f)]
# if missing_fields:
# print(f"Ответ GPT не содержит обязательных полей: {missing_fields} для URL: {url}")
# logger.warning(f"Ответ GPT не содержит обязательных полей: {missing_fields} для URL: {url}")
# print(f"Полученные данные: {data}")
# return
# Нормализация типов: category может приходить как list, а ожидается str # Нормализация типов: category может приходить как list, а ожидается str
if isinstance(data.get('category'), list): if isinstance(data.get('category'), list):
data['category'] = ', '.join(data['category']) data['category'] = ', '.join(data['category'])
@@ -60,13 +52,15 @@ def update_bd_and_create_document(
data['donesenie'] = False data['donesenie'] = False
data['bilutene'] = False data['bilutene'] = False
data['other'] = other data['other'] = other
data['download'] = False
# Сохранение в БД через pbd # Сохранение в БД через pbd
parsed_data = wp.ParsedData(**data) parsed_data = wp.ParsedData(**data)
wp.save_parsed_data_to_db(parsed_data) wp.save_parsed_data_to_db(parsed_data)
print("Данные успешно сохранены в БД") print("Данные успешно сохранены в БД")
# Создание DOCX документа # Создание DOCX документа
# path_day = parsed_at.split()[0].replace('-', '/')
path_day = article_date.split()[0] path_day = article_date.split()[0]
documents_path = os.path.join(DOCUMENTS_DIR, path_day) documents_path = os.path.join(DOCUMENTS_DIR, path_day)
if not os.path.exists(documents_path): if not os.path.exists(documents_path):
@@ -75,7 +69,7 @@ def update_bd_and_create_document(
doc = Document() doc = Document()
doc.add_heading('Ссылка на статью', level=1) doc.add_heading('Ссылка на статью', level=1)
doc.add_paragraph(other) doc.add_paragraph(url)
doc.add_heading('Дата и время', level=1) doc.add_heading('Дата и время', level=1)
doc.add_paragraph(article_date) doc.add_paragraph(article_date)
doc.add_heading('Обнаруженные тематики текста', level=1) doc.add_heading('Обнаруженные тематики текста', level=1)

View File

@@ -26,6 +26,7 @@ class ParsedData(BaseModel):
svodka: Optional[bool] = False svodka: Optional[bool] = False
donesenie: Optional[bool] = False donesenie: Optional[bool] = False
bilutene: Optional[bool] = False bilutene: Optional[bool] = False
download: Optional[bool] = False
other: str other: str
category: str category: str
translation_text: str translation_text: str
@@ -38,8 +39,8 @@ def save_parsed_data_to_db(data: ParsedData):
conn = get_connection() conn = get_connection()
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute(""" cur.execute("""
INSERT INTO url (url, parsed_at, title, original_text, article_date, status, viewed, tematik, svodka, donesenie, bilutene, other, category, translation_text, short_text) INSERT INTO url (url, parsed_at, title, original_text, article_date, status, viewed, tematik, svodka, donesenie, download, bilutene, other, category, translation_text, short_text)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (url) DO UPDATE SET ON CONFLICT (url) DO UPDATE SET
parsed_at = EXCLUDED.parsed_at, parsed_at = EXCLUDED.parsed_at,
title = EXCLUDED.title, title = EXCLUDED.title,
@@ -48,6 +49,7 @@ def save_parsed_data_to_db(data: ParsedData):
status = EXCLUDED.status, status = EXCLUDED.status,
viewed = EXCLUDED.viewed, viewed = EXCLUDED.viewed,
tematik = EXCLUDED.tematik, tematik = EXCLUDED.tematik,
download = EXCLUDED.download,
svodka = EXCLUDED.svodka, svodka = EXCLUDED.svodka,
donesenie = EXCLUDED.donesenie, donesenie = EXCLUDED.donesenie,
bilutene = EXCLUDED.bilutene, bilutene = EXCLUDED.bilutene,
@@ -55,7 +57,7 @@ def save_parsed_data_to_db(data: ParsedData):
category = EXCLUDED.category, category = EXCLUDED.category,
translation_text = EXCLUDED.translation_text, translation_text = EXCLUDED.translation_text,
short_text = EXCLUDED.short_text; short_text = EXCLUDED.short_text;
""", (data.url, data.parsed_at, data.title, data.original_text, data.article_date, data.status, data.viewed, data.tematik, data.svodka, data.donesenie, data.bilutene, data.other, data.category, data.translation_text, data.short_text)) """, (data.url, data.parsed_at, data.title, data.original_text, data.article_date, data.status, data.viewed, data.tematik, data.svodka, data.donesenie, data.download, data.bilutene, data.other, data.category, data.translation_text, data.short_text))
conn.commit() conn.commit()
return {"status": "success", "message": "Данные успешно сохранены"} return {"status": "success", "message": "Данные успешно сохранены"}
except Exception as e: except Exception as e:
@@ -66,6 +68,79 @@ def save_parsed_data_to_db(data: ParsedData):
if conn: if conn:
conn.close() conn.close()
def get_articles_by_filter(field_name: str, start_date: str, finish_date: str):
"""
Возвращает список заголовков статей по полю и диапазону дат для выгрузки
"""
conn = get_connection()
try:
# Проверка валидности поля
allowed_fields = ['tematik', 'svodka', 'donesenie', 'bilutene', 'status']
if field_name not in allowed_fields:
raise ValueError(f"Недопустимое поле: {field_name}. Разрешено: {allowed_fields}")
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute(f"""
SELECT title FROM url
WHERE {field_name} = TRUE
AND article_date BETWEEN %s AND %s
ORDER BY article_date DESC;
""", (start_date, finish_date))
rows = cur.fetchall()
return [row['title'] for row in rows]
except Exception as e:
print(f"Ошибка в get_articles_by_filter: {e}")
raise
def get_download_counts():
"""
Возвращает количество статей для каждого поля, где поле = TRUE и download = FALSE
"""
conn = get_connection()
try:
allowed_fields = ['tematik', 'svodka', 'donesenie', 'bilutene']
with conn.cursor(cursor_factory=RealDictCursor) as cur:
counts = {}
for field in allowed_fields:
cur.execute(f"""
SELECT COUNT(*) as count FROM url
WHERE {field} = TRUE
AND download = FALSE;
""")
row = cur.fetchone()
counts[field] = row['count']
return counts
except Exception as e:
print(f"Ошибка в get_download_counts: {e}")
raise
def mark_articles_as_downloaded(titles: list):
"""
Обновляет download = TRUE для списка заголовков
"""
if not titles:
return {"message": "Список заголовков пуст", "updated_rows": 0}
conn = get_connection()
try:
with conn.cursor() as cur:
cur.execute("""
UPDATE url
SET download = TRUE
WHERE title = ANY(%s);
""", (titles,))
updated_rows = cur.rowcount
conn.commit()
return {"message": f"Статус download обновлён для {updated_rows} статей", "updated_rows": updated_rows}
except Exception as e:
print(f"Ошибка в mark_articles_as_downloaded: {e}")
raise
# Глобальное подключение к БД # Глобальное подключение к БД
conn = None conn = None
@@ -73,15 +148,39 @@ conn = None
def get_connection(): def get_connection():
"""Получает подключение к БД, создавая новое при необходимости""" """Получает подключение к БД, создавая новое при необходимости"""
global conn global conn
if conn is None or conn.closed: try:
# Проверяем, активно ли подключение
if conn is None or conn.closed:
conn = psycopg2.connect(**DB_CONFIG)
conn.autocommit = True
# Дополнительная проверка на валидность
elif conn.info.transaction_status == 2: # TRANSACTION_IN_TRANS
# Подключение активно, но в транзакции — закроем и создадим новое
try:
conn.close()
except:
pass
conn = psycopg2.connect(**DB_CONFIG)
conn.autocommit = True
return conn
except Exception as e:
print(f"Ошибка при получении подключения: {e}")
# Сбрасываем подключение и пробуем заново
conn = None
conn = psycopg2.connect(**DB_CONFIG) conn = psycopg2.connect(**DB_CONFIG)
conn.autocommit = True conn.autocommit = True
return conn return conn
def close_connection():
"""Закрывает глобальное подключение к БД"""
global conn
if conn and not conn.closed:
conn.close()
conn = None
# Проверяет, есть ли указанный URL в базе данных. # Проверяет, есть ли указанный URL в базе данных.
def check_url_exists(url: str): def check_url_exists(url: str):
conn = get_connection() conn = get_connection()
try: try:
conn = get_connection()
with conn.cursor() as cursor: with conn.cursor() as cursor:
cursor.execute( cursor.execute(
"SELECT 1 FROM url WHERE url = %s LIMIT 1", "SELECT 1 FROM url WHERE url = %s LIMIT 1",
@@ -91,8 +190,6 @@ def check_url_exists(url: str):
return {"exists": bool(result)} return {"exists": bool(result)}
except Exception as e: except Exception as e:
raise HTTPException(status_code=500, detail=f"Ошибка при проверке: {e}") raise HTTPException(status_code=500, detail=f"Ошибка при проверке: {e}")
finally:
pass
# работа с базой данных показывания задач work_parser # работа с базой данных показывания задач work_parser
def create_table(): def create_table():
@@ -113,8 +210,8 @@ def create_table():
); );
""") """)
print("Таблица work_parser создана или уже существует") print("Таблица work_parser создана или уже существует")
finally: except Exception as e:
pass print(f"Ошибка при создании таблицы work_parser: {e}")
def insert_task(status, source_url=None, source_id=None, priority=0): def insert_task(status, source_url=None, source_id=None, priority=0):
conn = get_connection() conn = get_connection()
@@ -127,8 +224,9 @@ def insert_task(status, source_url=None, source_id=None, priority=0):
""", (status, source_url, priority)) """, (status, source_url, priority))
task_id = cur.fetchone()[0] task_id = cur.fetchone()[0]
return task_id return task_id
finally: except Exception as e:
pass print(f"Ошибка при создании задачи: {e}")
raise
def get_tasks_offset(limit, offset): def get_tasks_offset(limit, offset):
conn = get_connection() conn = get_connection()
@@ -141,8 +239,9 @@ def get_tasks_offset(limit, offset):
""", (limit, offset)) """, (limit, offset))
tasks = cur.fetchall() tasks = cur.fetchall()
return tasks return tasks
finally: except Exception as e:
pass print(f"Ошибка при получении задач: {e}")
raise
def delete_task(task_id: int): def delete_task(task_id: int):
conn = get_connection() conn = get_connection()
@@ -154,8 +253,9 @@ def delete_task(task_id: int):
return {"message": f"Задача {task_id} удалена", "deleted_task": dict(deleted_task)} return {"message": f"Задача {task_id} удалена", "deleted_task": dict(deleted_task)}
else: else:
return {"message": f"Задача с id {task_id} не найдена"} return {"message": f"Задача с id {task_id} не найдена"}
finally: except Exception as e:
pass print(f"Ошибка при удалении задачи: {e}")
raise
def update_task(task_id, **fields): def update_task(task_id, **fields):
conn = get_connection() conn = get_connection()
@@ -174,8 +274,9 @@ def update_task(task_id, **fields):
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute(f"UPDATE work_parser SET {set_sql} WHERE id = %s;", values) cur.execute(f"UPDATE work_parser SET {set_sql} WHERE id = %s;", values)
return True return True
finally: except Exception as e:
pass print(f"Ошибка при обновлении задачи: {e}")
raise
# Создание и работа с таблицей по созданию и редактированию промтов # Создание и работа с таблицей по созданию и редактированию промтов
def create_table_config_gpt(): def create_table_config_gpt():
@@ -189,8 +290,8 @@ def create_table_config_gpt():
); );
""") """)
print("Таблица config_gpt создана или уже существует") print("Таблица config_gpt создана или уже существует")
finally: except Exception as e:
pass print(f"Ошибка при создании таблицы config_gpt: {e}")
def update_promt(name: str, promt: str): def update_promt(name: str, promt: str):
conn = get_connection() conn = get_connection()
@@ -203,8 +304,9 @@ def update_promt(name: str, promt: str):
promt = EXCLUDED.promt promt = EXCLUDED.promt
""", (name, promt)) """, (name, promt))
conn.commit() conn.commit()
finally: except Exception as e:
pass print(f"Ошибка при обновлении промта: {e}")
raise
def get_promt(promt_name_url): def get_promt(promt_name_url):
conn = get_connection() conn = get_connection()
@@ -212,9 +314,12 @@ def get_promt(promt_name_url):
with conn.cursor(cursor_factory=RealDictCursor) as cur: with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("SELECT promt FROM config_gpt WHERE name = %s", (promt_name_url,)) cur.execute("SELECT promt FROM config_gpt WHERE name = %s", (promt_name_url,))
promt = cur.fetchone() promt = cur.fetchone()
return promt['promt'] if promt:
finally: return promt['promt']
pass return None
except Exception as e:
print(f"Ошибка при получении промта: {e}")
raise
def get_all_promt(): def get_all_promt():
# Возвращает список всех значений поля name из таблицы config_gpt # Возвращает список всех значений поля name из таблицы config_gpt
@@ -226,8 +331,9 @@ def get_all_promt():
sources = [{"name": row["name"], "promt": row["promt"]} for row in rows] sources = [{"name": row["name"], "promt": row["promt"]} for row in rows]
return {"sources": sources} return {"sources": sources}
finally: except Exception as e:
pass print(f"Ошибка при получении всех промтов: {e}")
raise
def get_all_categories_promt(): def get_all_categories_promt():
conn = get_connection() conn = get_connection()
@@ -237,8 +343,9 @@ def get_all_categories_promt():
rows = cur.fetchall() rows = cur.fetchall()
return [row["name"] for row in rows] return [row["name"] for row in rows]
finally: except Exception as e:
pass print(f"Ошибка при получении категорий: {e}")
raise
# Создание, сохранение и работа с таблицей ошибочных ссылок (error_url) # Создание, сохранение и работа с таблицей ошибочных ссылок (error_url)
def create_table_error_url(): def create_table_error_url():
@@ -253,8 +360,8 @@ def create_table_error_url():
); );
""") """)
print("Таблица error_url создана или уже существует") print("Таблица error_url создана или уже существует")
finally: except Exception as e:
pass print(f"Ошибка при создании таблицы error_url: {e}")
def add_error_url(source_url: str, error_sources_url: str): def add_error_url(source_url: str, error_sources_url: str):
conn = get_connection() conn = get_connection()
@@ -266,8 +373,9 @@ def add_error_url(source_url: str, error_sources_url: str):
RETURNING id; RETURNING id;
""", (source_url, error_sources_url)) """, (source_url, error_sources_url))
return cur.fetchone()[0] return cur.fetchone()[0]
finally: except Exception as e:
pass print(f"Ошибка при добавлении error_url: {e}")
raise
def check_error_url(error_sources_url: str) -> bool: def check_error_url(error_sources_url: str) -> bool:
conn = get_connection() conn = get_connection()
@@ -281,8 +389,9 @@ def check_error_url(error_sources_url: str) -> bool:
row = cur.fetchone() row = cur.fetchone()
return row is None return row is None
finally: except Exception as e:
pass print(f"Ошибка при проверке error_url: {e}")
return True
# Создание и работа с таблицей источников sources # Создание и работа с таблицей источников sources
def create_table_add_sourse(): def create_table_add_sourse():
@@ -292,38 +401,85 @@ def create_table_add_sourse():
cur.execute(""" cur.execute("""
CREATE TABLE IF NOT EXISTS sourse ( CREATE TABLE IF NOT EXISTS sourse (
url TEXT PRIMARY KEY, url TEXT PRIMARY KEY,
promt TEXT promt TEXT,
status BOOLEAN DEFAULT FALSE
); );
""") """)
print("Таблица sourse создана или уже существует") print("Таблица sourse создана или уже существует")
finally: except Exception as e:
pass print(f"Ошибка при создании таблицы sourse: {e}")
def add_sources(url: str, promt: str): def add_sources(url: str, promt: str, status: bool = False):
conn = get_connection() conn = get_connection()
try: try:
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute(""" cur.execute("""
INSERT INTO sourse (url, promt) INSERT INTO sourse (url, promt, status)
VALUES (%s, %s) VALUES (%s, %s, %s)
ON CONFLICT (url) DO UPDATE SET ON CONFLICT (url) DO UPDATE SET
promt = EXCLUDED.promt promt = EXCLUDED.promt,
""", (url, promt)) status = EXCLUDED.status
""", (url, promt, status))
conn.commit() conn.commit()
finally: except Exception as e:
pass print(f"Ошибка при добавлении источника: {e}")
raise
def get_all_sources(): def get_all_sources(category: str):
"""Возвращает все записи из таблицы sourse""" """Возвращает все записи из таблицы sourse. Сначала показываются записи со status=false"""
conn = get_connection() conn = get_connection()
try: try:
with conn.cursor(cursor_factory=RealDictCursor) as cur: with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("SELECT * FROM sourse") if category == "all":
cur.execute("""
SELECT * FROM sourse
ORDER BY status ASC, url ASC
""")
else:
cur.execute("""
SELECT * FROM sourse
WHERE promt = %s
ORDER BY status ASC, url ASC
""", (category,))
rows = cur.fetchall() rows = cur.fetchall()
sources = [{"url": row["url"], "promt": row["promt"]} for row in rows] sources = [{"url": row["url"], "promt": row["promt"], "status": row["status"]} for row in rows]
return {"sources": sources} return {"sources": sources}
finally: except Exception as e:
pass print(f"Ошибка при получении источников: {e}")
return {"error": str(e), "sources": []}
def get_true_sources():
"""Возвращает все записи из таблицы sourse. Сначала показываются записи со status=true"""
conn = get_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("""
SELECT * FROM sourse
WHERE status = false
""")
rows = cur.fetchall()
sources = {}
for row in rows:
sources.update({row["url"]: row["promt"]})
return sources
except Exception as e:
print(f"Ошибка при получении источников: {e}")
return {"error": str(e), "sources": []}
def update_source_status(url: str, status: bool = True):
"""Обновляет статус источника по URL"""
conn = get_connection()
try:
with conn.cursor() as cur:
cur.execute("""
UPDATE sourse SET status = %s WHERE url = %s
""", (status, url))
updated = cur.rowcount
conn.commit()
return {"message": f"Статус обновлён для {url}", "updated_rows": updated}
except Exception as e:
print(f"Ошибка при обновлении статуса: {e}")
return {"error": str(e), "updated_rows": 0}
def delete_sources(url: str): def delete_sources(url: str):
"""Удаляет источник по URL из таблицы sourse""" """Удаляет источник по URL из таблицы sourse"""
@@ -340,8 +496,6 @@ def delete_sources(url: str):
except Exception as e: except Exception as e:
print(f"Ошибка при удалении источника: {e}") print(f"Ошибка при удалении источника: {e}")
return {"error": str(e), "deleted": False} return {"error": str(e), "deleted": False}
finally:
pass
# Пример использования # Пример использования
# if __name__ == "__main__": # if __name__ == "__main__":