сортировка по странам + перенос работы с бд
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
2026-04-25 16:03:04 +10:00
parent 92cb60aa1d
commit 17343efe8c
3 changed files with 394 additions and 63 deletions

141
main.py
View File

@@ -17,19 +17,22 @@ from bs4 import BeautifulSoup
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from docx import Document from docx import Document
from newspaper import Article from newspaper import Article
from fastapi import BackgroundTasks, FastAPI, Query, Request, Depends from fastapi import BackgroundTasks, FastAPI, Query, Request, Depends, HTTPException
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse from fastapi.responses import FileResponse
from pydantic import BaseModel, HttpUrl from pydantic import BaseModel, HttpUrl
from typing import List
from urllib.parse import urljoin, urlparse, urldefrag from urllib.parse import urljoin, urlparse, urldefrag
import uvicorn import uvicorn
import requests import requests
# Локальные импорты # Локальные импорты
# import settings_work as sw import parser_bd as pbd
import work_parser as wp import work_parser as wp
DOCUMENTS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "documents") DOCUMENTS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "documents")
@asynccontextmanager @asynccontextmanager
@@ -207,9 +210,9 @@ def extract_text_from_url(url, timeout=10, verify=True):
return "\n".join(content_text), time_t return "\n".join(content_text), time_t
# GPT_SERVER_URL = os.getenv('GPT_SERVER_URL', 'http://45.129.78.228:8484') GPT_SERVER_URL = os.getenv('GPT_SERVER_URL', 'http://45.129.78.228:8484')
# GPT_SERVER_URL = os.getenv('GPT_SERVER_URL', 'http://172.17.0.1:8484') # GPT_SERVER_URL = os.getenv('GPT_SERVER_URL', 'http://172.17.0.1:8484')
GPT_SERVER_URL = os.getenv('GPT_SERVER_URL', 'http://127.0.0.1:8484') # GPT_SERVER_URL = os.getenv('GPT_SERVER_URL', 'http://127.0.0.1:8484')
def gpt_response_message(content: str, name_promt: str): def gpt_response_message(content: str, name_promt: str):
contentGPT = wp.get_promt(name_promt).replace('{content}', content) contentGPT = wp.get_promt(name_promt).replace('{content}', content)
@@ -248,24 +251,11 @@ def gpt_response_message(content: str, name_promt: str):
# Общие функции проверки ссылок # Общие функции проверки ссылок
def check_url(url): def check_url(url):
try: try:
response = wp.check_url_exists(url) #get('http://45.129.78.228:8002/check_url_exists', params={'url': url}) result = pbd.check_url_exists_in_db(url)
if response.status_code == 200: return result.get("exists", False)
result = response.json()
# print(result["exists"])
return result["exists"]
else:
# print(f"Ошибка: {response.status_code}")
# Если сервис недоступен — считаем, что URL новый (пропускаем)
return False
except requests.exceptions.Timeout:
# print(f"Timeout при проверке URL: {url}")
# logger.warning(f"check_url timeout: {url}")
# Если таймаут — считаем, что URL новый (пропускаем)
return False
except Exception as e: except Exception as e:
# print(f"Ошибка при проверке URL: {e}")
# logger.error(f"check_url error: {e}")
# Если ошибка — считаем, что URL новый (пропускаем) # Если ошибка — считаем, что URL новый (пропускаем)
logger.error(f"check_url error: {e}")
return False return False
# функции даты первого источника (газета) # функции даты первого источника (газета)
@@ -296,7 +286,11 @@ def update_bd_and_create_document(response_text, article_date, url, parsed_at, o
data['status'] = False data['status'] = False
data['viewed'] = False data['viewed'] = False
data['other'] = other data['other'] = other
print(requests.post('http://45.129.78.228:8002/save_parsed_data', json=data))
# Заменяем HTTP-запрос на прямой вызов функции через pbd
parsed_data = pbd.ParsedData(**data)
pbd.save_parsed_data_to_db(parsed_data)
print("Данные успешно сохранены в БД")
path_day = article_date.split()[0] path_day = article_date.split()[0]
documents_path = os.path.join(DOCUMENTS_DIR, path_day) documents_path = os.path.join(DOCUMENTS_DIR, path_day)
@@ -364,7 +358,7 @@ def start_pars_one_istochnik(data_init=""):
response_text = gpt_response_message(text, "source1") response_text = gpt_response_message(text, "source1")
print(response_text) print(response_text)
if response_text: if response_text:
update_bd_and_create_document(response_text=response_text, article_date=f"{current_year}/{current_month}/{current_day}", url=link, parsed_at=str(dt.now()), original_text=text, other=url) update_bd_and_create_document(response_text=response_text, article_date=f"{current_year}/{current_month}/{current_day}", url=link, parsed_at=str(dt.now()), original_text=text, other="source1")
wp.update_task(task_id, status='completed', finished_at=datetime.utcnow()) wp.update_task(task_id, status='completed', finished_at=datetime.utcnow())
@@ -392,7 +386,7 @@ def start_pars_two_istochnik():
response_text = gpt_response_message(text, "source2") response_text = gpt_response_message(text, "source2")
print(response_text) print(response_text)
if response_text: if response_text:
update_bd_and_create_document(response_text=response_text, article_date=time_text, url=hrefs, parsed_at=str(dt.now()), original_text=text, other=url) update_bd_and_create_document(response_text=response_text, article_date=time_text, url=hrefs, parsed_at=str(dt.now()), original_text=text, other="source2")
except: except:
continue continue
@@ -446,7 +440,7 @@ def start_pars_all_istochnik(url:str, promt:str):
response_text = gpt_response_message(str(article.text), promt) response_text = gpt_response_message(str(article.text), promt)
print(response_text) print(response_text)
if response_text: if response_text:
update_bd_and_create_document(response_text=response_text, article_date=time_text, url=abs_url, parsed_at=str(dt.now()), original_text=article.text, other=url) update_bd_and_create_document(response_text=response_text, article_date=time_text, url=abs_url, parsed_at=str(dt.now()), original_text=article.text, other=promt)
else: else:
wp.add_error_url(url, abs_url) wp.add_error_url(url, abs_url)
except Exception as e: except Exception as e:
@@ -637,6 +631,105 @@ async def get_logs():
lines = file.readlines()[-10:] # последние 10 строк lines = file.readlines()[-10:] # последние 10 строк
return {"logs": lines} return {"logs": lines}
# ==================== Эндпоинты из parser_bd.py ====================
@app.post("/save_parsed_data", summary="Сохранить данные парсинга")
def save_parsed_data(data: pbd.ParsedData):
try:
pbd.save_parsed_data_to_db(data)
return {"status": "success", "message": "Данные успешно сохранены"}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Ошибка при сохранении данных: {e}")
@app.post("/update_viewed_status", summary="Обновляет поле viewed")
def update_viewed_status(url: str, viewed: bool):
"""
Обновляет поле 'viewed' записи в БД по заданному URL.
"""
try:
result = pbd.update_viewed_status_in_db(url, viewed)
if not result.get("found"):
raise HTTPException(status_code=404, detail="Запись с указанным URL не найдена")
except Exception as e:
if isinstance(e, HTTPException):
raise e
raise HTTPException(status_code=500, detail=f"Ошибка при сохранении данных: {e}")
return {"status": "success", "message": "Статус просмотра успешно обновлен"}
@app.post("/update_status_status", summary="Обновляет поле status")
def update_status_status(url: str, status: bool):
"""
Обновляет поле 'status' записи в БД по заданному URL.
"""
try:
result = pbd.update_status_status_in_db(url, status)
if not result.get("found"):
raise HTTPException(status_code=404, detail="Запись с указанным URL не найдена")
except Exception as e:
if isinstance(e, HTTPException):
raise e
raise HTTPException(status_code=500, detail=f"Ошибка при сохранении данных: {e}")
return {"status": "success", "message": "Статус просмотра успешно обновлен"}
@app.get("/check_url_exists", summary="Проверяет url")
def check_url_exists(url: str):
"""
Проверяет, есть ли указанный URL в базе данных.
Возвращает true, если есть, иначе false.
"""
try:
return pbd.check_url_exists_in_db(url)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Ошибка при проверке: {e}")
@app.get("/records", summary="Получить записи из БД с пагинацией", response_model=List[pbd.ParsedData])
def get_records(offset: int = Query(0, ge=0), limit: int = Query(10, ge=1, le=100)):
"""
Возвращает записи из таблицы url с учетом offset и limit.
"""
try:
return pbd.get_records_from_db(offset, limit)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Ошибка при получении записей из БД: {e}")
@app.get("/records_all/count", summary="Получить общее количество записей")
def get_records_count(item: str = Query("default")):
"""
Возвращает общее количество записей в таблице.
"""
try:
return pbd.get_records_count_from_db(item)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Ошибка при получении количества: {e}")
@app.get("/poisk/count", summary="Получить количество результатов поиска")
def get_poisk_count(query: str, item: str = Query("default")):
try:
return pbd.get_poisk_count_from_db(query, item)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Ошибка при получении количества: {e}")
@app.get("/poisk", summary="Поиск с пагинацией")
def poisk(query: str, offset: int = Query(0, ge=0), limit: int = Query(10, ge=1, le=100), item: str = Query("default")):
try:
return pbd.poisk_from_db(query, offset, limit, item)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Ошибка при поиске: {e}")
@app.get("/records_all", summary="Получить все записи из БД + сортирует + пагинация", response_model=List[pbd.ParsedData])
def get_records_all(item: str = Query("default"), offset: int = Query(0, ge=0), limit: int = Query(10, ge=1, le=100)):
"""
Возвращает записи из таблицы url с учетом фильтрации и пагинации.
"""
try:
return pbd.get_records_all_from_db(item, offset, limit)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Ошибка при получении записей из БД: {e}")
if __name__ == "__main__": if __name__ == "__main__":
uvicorn.run("main:app", port=8001, reload=True) uvicorn.run("main:app", port=8001, reload=True)

277
parser_bd.py Normal file
View File

@@ -0,0 +1,277 @@
from pydantic import BaseModel
from typing import List, Optional
import psycopg2
import psycopg2.extras
from fastapi import HTTPException
# Модель для данных, которые приходят в POST
class ParsedData(BaseModel):
url: str
parsed_at: str
title: str
original_text: str
article_date: str
status: Optional[bool] = False
viewed: Optional[bool] = False
other: str
category: str
translation_text: str
short_text: str
# Подключение к БД
def get_connection():
return psycopg2.connect(
dbname="parsed_url",
user="postgres",
password="qwertyqwerty123123",
host="45.129.78.228"
)
# Функции для работы с БД (без эндпоинтов)
def save_parsed_data_to_db(data: ParsedData):
conn = None
try:
conn = get_connection()
with conn.cursor() as cur:
cur.execute("""
INSERT INTO url (url, parsed_at, title, original_text, article_date, status, viewed, other, category, translation_text, short_text)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (url) DO UPDATE SET
parsed_at = EXCLUDED.parsed_at,
title = EXCLUDED.title,
original_text = EXCLUDED.original_text,
article_date = EXCLUDED.article_date,
status = EXCLUDED.status,
viewed = EXCLUDED.viewed,
other = EXCLUDED.other,
category = EXCLUDED.category,
translation_text = EXCLUDED.translation_text,
short_text = EXCLUDED.short_text;
""", (data.url, data.parsed_at, data.title, data.original_text, data.article_date, data.status, data.viewed, data.other, data.category, data.translation_text, data.short_text))
conn.commit()
return {"status": "success", "message": "Данные успешно сохранены"}
except Exception as e:
if conn:
conn.rollback()
raise e
finally:
if conn:
conn.close()
def update_viewed_status_in_db(url: str, viewed: bool):
conn = None
try:
conn = get_connection()
with conn.cursor() as cursor:
cursor.execute(
"""
UPDATE url
SET viewed = %s
WHERE url = %s
""",
(viewed, url)
)
if cursor.rowcount == 0:
return {"found": False}
conn.commit()
return {"found": True}
except Exception as e:
if conn:
conn.rollback()
raise e
finally:
if conn:
conn.close()
def update_status_status_in_db(url: str, status: bool):
conn = None
try:
conn = get_connection()
with conn.cursor() as cursor:
cursor.execute(
"""
UPDATE url
SET status = %s
WHERE url = %s
""",
(status, url)
)
if cursor.rowcount == 0:
return {"found": False}
conn.commit()
return {"found": True}
except Exception as e:
if conn:
conn.rollback()
raise e
finally:
if conn:
conn.close()
def check_url_exists_in_db(url: str):
conn = None
try:
conn = get_connection()
with conn.cursor() as cursor:
cursor.execute(
"SELECT 1 FROM url WHERE url = %s LIMIT 1",
(url,)
)
result = cursor.fetchone()
return {"exists": bool(result)}
except Exception as e:
raise e
finally:
if conn:
conn.close()
def get_records_from_db(offset: int = 0, limit: int = 10):
conn = None
try:
conn = get_connection()
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute("""
SELECT url, parsed_at, title, original_text, article_date, status, viewed, other, category, translation_text, short_text
FROM url
ORDER BY parsed_at DESC NULLS LAST
OFFSET %s LIMIT %s
""", (offset, limit))
rows = cur.fetchall()
results = [dict(row) for row in rows]
return results
except Exception as e:
raise e
finally:
if conn:
conn.close()
def get_records_count_from_db(item: str = "default"):
conn = None
try:
conn = get_connection()
with conn.cursor() as cur:
if item == "viewed":
cur.execute("SELECT COUNT(*) FROM url WHERE viewed = true")
elif item == "status":
cur.execute("SELECT COUNT(*) FROM url WHERE status = true")
elif item == "time":
cur.execute("SELECT COUNT(*) FROM url")
else:
cur.execute("SELECT COUNT(*) FROM url")
result = cur.fetchone()
return {"count": result[0]}
except Exception as e:
raise e
finally:
if conn:
conn.close()
def get_poisk_count_from_db(query: str, item: str = "default"):
conn = None
try:
conn = get_connection()
with conn.cursor() as cur:
search_pattern = f"%{query}%"
base_query = """SELECT COUNT(*) FROM url WHERE (
title ILIKE %s OR original_text ILIKE %s OR translation_text ILIKE %s
OR short_text ILIKE %s OR url ILIKE %s OR category ILIKE %s OR other ILIKE %s
)"""
params = [search_pattern] * 7
if item == "viewed":
base_query += " AND viewed = true"
elif item == "status":
base_query += " AND status = true"
cur.execute(base_query, params)
result = cur.fetchone()
return {"count": result[0]}
except Exception as e:
raise e
finally:
if conn:
conn.close()
def poisk_from_db(query: str, offset: int = 0, limit: int = 10, item: str = "default"):
conn = None
try:
conn = get_connection()
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
search_pattern = f"%{query}%"
base_query = """SELECT * FROM url WHERE (
title ILIKE %s OR original_text ILIKE %s OR translation_text ILIKE %s
OR short_text ILIKE %s OR url ILIKE %s OR category ILIKE %s OR other ILIKE %s
)"""
params = [search_pattern] * 7
if item == "viewed":
base_query += " AND viewed = true"
elif item == "status":
base_query += " AND status = true"
base_query += " ORDER BY article_date DESC OFFSET %s LIMIT %s"
params.extend([offset, limit])
cur.execute(base_query, params)
rows = cur.fetchall()
results = [dict(row) for row in rows]
return results
except Exception as e:
raise e
finally:
if conn:
conn.close()
def get_records_all_from_db(item: str = "default", offset: int = 0, limit: int = 10):
conn = None
try:
conn = get_connection()
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
if item == "time":
cur.execute("""
SELECT * FROM url
ORDER BY viewed ASC, parsed_at DESC
OFFSET %s LIMIT %s
""", (offset, limit))
elif item == "viewed":
cur.execute("""
SELECT * FROM url
WHERE viewed = true
ORDER BY article_date DESC
OFFSET %s LIMIT %s
""", (offset, limit))
elif item == "status":
cur.execute("""
SELECT * FROM url
WHERE status = true
ORDER BY parsed_at DESC
OFFSET %s LIMIT %s
""", (offset, limit))
elif item == "Япония" or item == "Корея":
cur.execute("""
SELECT * FROM url
WHERE other = %s
ORDER BY viewed ASC, article_date DESC
OFFSET %s LIMIT %s
""", (item, offset, limit))
elif item == "Китай":
cur.execute("""
SELECT * FROM url
WHERE other IN (%s, %s, %s)
ORDER BY viewed ASC, article_date DESC
OFFSET %s LIMIT %s
""", ("source1", "source2", "Китай", offset, limit))
else:
cur.execute("""
SELECT * FROM url
ORDER BY viewed ASC, article_date DESC
OFFSET %s LIMIT %s
""", (offset, limit))
rows = cur.fetchall()
results = [dict(row) for row in rows]
return results
except Exception as e:
raise e
finally:
if conn:
conn.close()

View File

@@ -1,39 +0,0 @@
from pydantic import BaseModel
from typing import List
import json
# Модель для источника
class Source(BaseModel):
name: str
url: str
prompt: str
# Модель для настроек (список источников)
class Settings(BaseModel):
sources: List[Source]
# Путь к файлу с настройками
SETTINGS_FILE = "config.json"
# Чтение настроек из файла
def read_settings() -> Settings:
try:
with open(SETTINGS_FILE, "r", encoding="utf-8") as f:
data = json.load(f)
return Settings(**data)
except (FileNotFoundError, json.JSONDecodeError):
return Settings(sources=[])
# Запись настроек в файл
def write_settings(settings: Settings):
with open(SETTINGS_FILE, "w", encoding="utf-8") as f:
json.dump(settings.dict(), f, ensure_ascii=False, indent=2)
# Обновление данных по источнику
def update_source(new_source: Source) -> dict:
settings = read_settings()
for i, source in enumerate(settings.sources):
if source.name == new_source.name:
settings.sources[i] = new_source
write_settings(settings)
return {"code": 0, "message": f"Источник '{new_source.name}' успешно обновлен."}
return {"code": 1, "message": f"Источник с именем '{new_source.name}' не найден."}