From 17343efe8cc91aaca54a4d9579bcb0737ebc1f84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=98=D0=B3=D0=BE=D1=80=D1=8C=20=D0=91=D0=B0=D0=BD=D0=B4?= =?UTF-8?q?=D1=83=D1=80=D0=B8=D1=81=D1=82?= Date: Sat, 25 Apr 2026 16:03:04 +1000 Subject: [PATCH] =?UTF-8?q?=D1=81=D0=BE=D1=80=D1=82=D0=B8=D1=80=D0=BE?= =?UTF-8?q?=D0=B2=D0=BA=D0=B0=20=D0=BF=D0=BE=20=D1=81=D1=82=D1=80=D0=B0?= =?UTF-8?q?=D0=BD=D0=B0=D0=BC=20+=20=D0=BF=D0=B5=D1=80=D0=B5=D0=BD=D0=BE?= =?UTF-8?q?=D1=81=20=D1=80=D0=B0=D0=B1=D0=BE=D1=82=D1=8B=20=D1=81=20=D0=B1?= =?UTF-8?q?=D0=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 141 ++++++++++++++++++++---- parser_bd.py | 277 +++++++++++++++++++++++++++++++++++++++++++++++ settings_work.py | 39 ------- 3 files changed, 394 insertions(+), 63 deletions(-) create mode 100644 parser_bd.py delete mode 100644 settings_work.py diff --git a/main.py b/main.py index 45224d0..194291b 100644 --- a/main.py +++ b/main.py @@ -17,19 +17,22 @@ from bs4 import BeautifulSoup from contextlib import asynccontextmanager from docx import Document from newspaper import Article -from fastapi import BackgroundTasks, FastAPI, Query, Request, Depends +from fastapi import BackgroundTasks, FastAPI, Query, Request, Depends, HTTPException from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import FileResponse from pydantic import BaseModel, HttpUrl +from typing import List from urllib.parse import urljoin, urlparse, urldefrag import uvicorn import requests # Локальные импорты -# import settings_work as sw +import parser_bd as pbd import work_parser as wp + + DOCUMENTS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "documents") @asynccontextmanager @@ -207,9 +210,9 @@ def extract_text_from_url(url, timeout=10, verify=True): return "\n".join(content_text), time_t -# GPT_SERVER_URL = os.getenv('GPT_SERVER_URL', 'http://45.129.78.228:8484') +GPT_SERVER_URL = os.getenv('GPT_SERVER_URL', 'http://45.129.78.228:8484') # GPT_SERVER_URL = os.getenv('GPT_SERVER_URL', 'http://172.17.0.1:8484') -GPT_SERVER_URL = os.getenv('GPT_SERVER_URL', 'http://127.0.0.1:8484') +# GPT_SERVER_URL = os.getenv('GPT_SERVER_URL', 'http://127.0.0.1:8484') def gpt_response_message(content: str, name_promt: str): contentGPT = wp.get_promt(name_promt).replace('{content}', content) @@ -248,24 +251,11 @@ def gpt_response_message(content: str, name_promt: str): # Общие функции проверки ссылок def check_url(url): try: - response = wp.check_url_exists(url) #get('http://45.129.78.228:8002/check_url_exists', params={'url': url}) - if response.status_code == 200: - result = response.json() - # print(result["exists"]) - return result["exists"] - else: - # print(f"Ошибка: {response.status_code}") - # Если сервис недоступен — считаем, что URL новый (пропускаем) - return False - except requests.exceptions.Timeout: - # print(f"Timeout при проверке URL: {url}") - # logger.warning(f"check_url timeout: {url}") - # Если таймаут — считаем, что URL новый (пропускаем) - return False + result = pbd.check_url_exists_in_db(url) + return result.get("exists", False) except Exception as e: - # print(f"Ошибка при проверке URL: {e}") - # logger.error(f"check_url error: {e}") # Если ошибка — считаем, что URL новый (пропускаем) + logger.error(f"check_url error: {e}") return False # функции даты первого источника (газета) @@ -296,7 +286,11 @@ def update_bd_and_create_document(response_text, article_date, url, parsed_at, o data['status'] = False data['viewed'] = False data['other'] = other - print(requests.post('http://45.129.78.228:8002/save_parsed_data', json=data)) + + # Заменяем HTTP-запрос на прямой вызов функции через pbd + parsed_data = pbd.ParsedData(**data) + pbd.save_parsed_data_to_db(parsed_data) + print("Данные успешно сохранены в БД") path_day = article_date.split()[0] documents_path = os.path.join(DOCUMENTS_DIR, path_day) @@ -364,7 +358,7 @@ def start_pars_one_istochnik(data_init=""): response_text = gpt_response_message(text, "source1") print(response_text) if response_text: - update_bd_and_create_document(response_text=response_text, article_date=f"{current_year}/{current_month}/{current_day}", url=link, parsed_at=str(dt.now()), original_text=text, other=url) + update_bd_and_create_document(response_text=response_text, article_date=f"{current_year}/{current_month}/{current_day}", url=link, parsed_at=str(dt.now()), original_text=text, other="source1") wp.update_task(task_id, status='completed', finished_at=datetime.utcnow()) @@ -392,7 +386,7 @@ def start_pars_two_istochnik(): response_text = gpt_response_message(text, "source2") print(response_text) if response_text: - update_bd_and_create_document(response_text=response_text, article_date=time_text, url=hrefs, parsed_at=str(dt.now()), original_text=text, other=url) + update_bd_and_create_document(response_text=response_text, article_date=time_text, url=hrefs, parsed_at=str(dt.now()), original_text=text, other="source2") except: continue @@ -446,7 +440,7 @@ def start_pars_all_istochnik(url:str, promt:str): response_text = gpt_response_message(str(article.text), promt) print(response_text) if response_text: - update_bd_and_create_document(response_text=response_text, article_date=time_text, url=abs_url, parsed_at=str(dt.now()), original_text=article.text, other=url) + update_bd_and_create_document(response_text=response_text, article_date=time_text, url=abs_url, parsed_at=str(dt.now()), original_text=article.text, other=promt) else: wp.add_error_url(url, abs_url) except Exception as e: @@ -637,6 +631,105 @@ async def get_logs(): lines = file.readlines()[-10:] # последние 10 строк return {"logs": lines} + +# ==================== Эндпоинты из parser_bd.py ==================== + +@app.post("/save_parsed_data", summary="Сохранить данные парсинга") +def save_parsed_data(data: pbd.ParsedData): + try: + pbd.save_parsed_data_to_db(data) + return {"status": "success", "message": "Данные успешно сохранены"} + except Exception as e: + raise HTTPException(status_code=500, detail=f"Ошибка при сохранении данных: {e}") + +@app.post("/update_viewed_status", summary="Обновляет поле viewed") +def update_viewed_status(url: str, viewed: bool): + """ + Обновляет поле 'viewed' записи в БД по заданному URL. + """ + try: + result = pbd.update_viewed_status_in_db(url, viewed) + if not result.get("found"): + raise HTTPException(status_code=404, detail="Запись с указанным URL не найдена") + except Exception as e: + if isinstance(e, HTTPException): + raise e + raise HTTPException(status_code=500, detail=f"Ошибка при сохранении данных: {e}") + + return {"status": "success", "message": "Статус просмотра успешно обновлен"} + +@app.post("/update_status_status", summary="Обновляет поле status") +def update_status_status(url: str, status: bool): + """ + Обновляет поле 'status' записи в БД по заданному URL. + """ + try: + result = pbd.update_status_status_in_db(url, status) + if not result.get("found"): + raise HTTPException(status_code=404, detail="Запись с указанным URL не найдена") + except Exception as e: + if isinstance(e, HTTPException): + raise e + raise HTTPException(status_code=500, detail=f"Ошибка при сохранении данных: {e}") + + return {"status": "success", "message": "Статус просмотра успешно обновлен"} + +@app.get("/check_url_exists", summary="Проверяет url") +def check_url_exists(url: str): + """ + Проверяет, есть ли указанный URL в базе данных. + Возвращает true, если есть, иначе false. + """ + try: + return pbd.check_url_exists_in_db(url) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Ошибка при проверке: {e}") + +@app.get("/records", summary="Получить записи из БД с пагинацией", response_model=List[pbd.ParsedData]) +def get_records(offset: int = Query(0, ge=0), limit: int = Query(10, ge=1, le=100)): + """ + Возвращает записи из таблицы url с учетом offset и limit. + """ + try: + return pbd.get_records_from_db(offset, limit) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Ошибка при получении записей из БД: {e}") + +@app.get("/records_all/count", summary="Получить общее количество записей") +def get_records_count(item: str = Query("default")): + """ + Возвращает общее количество записей в таблице. + """ + try: + return pbd.get_records_count_from_db(item) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Ошибка при получении количества: {e}") + +@app.get("/poisk/count", summary="Получить количество результатов поиска") +def get_poisk_count(query: str, item: str = Query("default")): + try: + return pbd.get_poisk_count_from_db(query, item) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Ошибка при получении количества: {e}") + +@app.get("/poisk", summary="Поиск с пагинацией") +def poisk(query: str, offset: int = Query(0, ge=0), limit: int = Query(10, ge=1, le=100), item: str = Query("default")): + try: + return pbd.poisk_from_db(query, offset, limit, item) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Ошибка при поиске: {e}") + +@app.get("/records_all", summary="Получить все записи из БД + сортирует + пагинация", response_model=List[pbd.ParsedData]) +def get_records_all(item: str = Query("default"), offset: int = Query(0, ge=0), limit: int = Query(10, ge=1, le=100)): + """ + Возвращает записи из таблицы url с учетом фильтрации и пагинации. + """ + try: + return pbd.get_records_all_from_db(item, offset, limit) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Ошибка при получении записей из БД: {e}") + + if __name__ == "__main__": uvicorn.run("main:app", port=8001, reload=True) diff --git a/parser_bd.py b/parser_bd.py new file mode 100644 index 0000000..b31b237 --- /dev/null +++ b/parser_bd.py @@ -0,0 +1,277 @@ +from pydantic import BaseModel +from typing import List, Optional +import psycopg2 +import psycopg2.extras +from fastapi import HTTPException + +# Модель для данных, которые приходят в POST +class ParsedData(BaseModel): + url: str + parsed_at: str + title: str + original_text: str + article_date: str + status: Optional[bool] = False + viewed: Optional[bool] = False + other: str + category: str + translation_text: str + short_text: str + +# Подключение к БД +def get_connection(): + return psycopg2.connect( + dbname="parsed_url", + user="postgres", + password="qwertyqwerty123123", + host="45.129.78.228" + ) + +# Функции для работы с БД (без эндпоинтов) +def save_parsed_data_to_db(data: ParsedData): + conn = None + try: + conn = get_connection() + with conn.cursor() as cur: + cur.execute(""" + INSERT INTO url (url, parsed_at, title, original_text, article_date, status, viewed, other, category, translation_text, short_text) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + ON CONFLICT (url) DO UPDATE SET + parsed_at = EXCLUDED.parsed_at, + title = EXCLUDED.title, + original_text = EXCLUDED.original_text, + article_date = EXCLUDED.article_date, + status = EXCLUDED.status, + viewed = EXCLUDED.viewed, + other = EXCLUDED.other, + category = EXCLUDED.category, + translation_text = EXCLUDED.translation_text, + short_text = EXCLUDED.short_text; + """, (data.url, data.parsed_at, data.title, data.original_text, data.article_date, data.status, data.viewed, data.other, data.category, data.translation_text, data.short_text)) + conn.commit() + return {"status": "success", "message": "Данные успешно сохранены"} + except Exception as e: + if conn: + conn.rollback() + raise e + finally: + if conn: + conn.close() + +def update_viewed_status_in_db(url: str, viewed: bool): + conn = None + try: + conn = get_connection() + with conn.cursor() as cursor: + cursor.execute( + """ + UPDATE url + SET viewed = %s + WHERE url = %s + """, + (viewed, url) + ) + if cursor.rowcount == 0: + return {"found": False} + conn.commit() + return {"found": True} + except Exception as e: + if conn: + conn.rollback() + raise e + finally: + if conn: + conn.close() + +def update_status_status_in_db(url: str, status: bool): + conn = None + try: + conn = get_connection() + with conn.cursor() as cursor: + cursor.execute( + """ + UPDATE url + SET status = %s + WHERE url = %s + """, + (status, url) + ) + if cursor.rowcount == 0: + return {"found": False} + conn.commit() + return {"found": True} + except Exception as e: + if conn: + conn.rollback() + raise e + finally: + if conn: + conn.close() + +def check_url_exists_in_db(url: str): + conn = None + try: + conn = get_connection() + with conn.cursor() as cursor: + cursor.execute( + "SELECT 1 FROM url WHERE url = %s LIMIT 1", + (url,) + ) + result = cursor.fetchone() + return {"exists": bool(result)} + except Exception as e: + raise e + finally: + if conn: + conn.close() + +def get_records_from_db(offset: int = 0, limit: int = 10): + conn = None + try: + conn = get_connection() + with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: + cur.execute(""" + SELECT url, parsed_at, title, original_text, article_date, status, viewed, other, category, translation_text, short_text + FROM url + ORDER BY parsed_at DESC NULLS LAST + OFFSET %s LIMIT %s + """, (offset, limit)) + rows = cur.fetchall() + results = [dict(row) for row in rows] + return results + except Exception as e: + raise e + finally: + if conn: + conn.close() + +def get_records_count_from_db(item: str = "default"): + conn = None + try: + conn = get_connection() + with conn.cursor() as cur: + if item == "viewed": + cur.execute("SELECT COUNT(*) FROM url WHERE viewed = true") + elif item == "status": + cur.execute("SELECT COUNT(*) FROM url WHERE status = true") + elif item == "time": + cur.execute("SELECT COUNT(*) FROM url") + else: + cur.execute("SELECT COUNT(*) FROM url") + result = cur.fetchone() + return {"count": result[0]} + except Exception as e: + raise e + finally: + if conn: + conn.close() + +def get_poisk_count_from_db(query: str, item: str = "default"): + conn = None + try: + conn = get_connection() + with conn.cursor() as cur: + search_pattern = f"%{query}%" + base_query = """SELECT COUNT(*) FROM url WHERE ( + title ILIKE %s OR original_text ILIKE %s OR translation_text ILIKE %s + OR short_text ILIKE %s OR url ILIKE %s OR category ILIKE %s OR other ILIKE %s + )""" + params = [search_pattern] * 7 + + if item == "viewed": + base_query += " AND viewed = true" + elif item == "status": + base_query += " AND status = true" + + cur.execute(base_query, params) + result = cur.fetchone() + return {"count": result[0]} + except Exception as e: + raise e + finally: + if conn: + conn.close() + +def poisk_from_db(query: str, offset: int = 0, limit: int = 10, item: str = "default"): + conn = None + try: + conn = get_connection() + with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: + search_pattern = f"%{query}%" + base_query = """SELECT * FROM url WHERE ( + title ILIKE %s OR original_text ILIKE %s OR translation_text ILIKE %s + OR short_text ILIKE %s OR url ILIKE %s OR category ILIKE %s OR other ILIKE %s + )""" + params = [search_pattern] * 7 + + if item == "viewed": + base_query += " AND viewed = true" + elif item == "status": + base_query += " AND status = true" + + base_query += " ORDER BY article_date DESC OFFSET %s LIMIT %s" + params.extend([offset, limit]) + + cur.execute(base_query, params) + rows = cur.fetchall() + results = [dict(row) for row in rows] + return results + except Exception as e: + raise e + finally: + if conn: + conn.close() + +def get_records_all_from_db(item: str = "default", offset: int = 0, limit: int = 10): + conn = None + try: + conn = get_connection() + with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: + if item == "time": + cur.execute(""" + SELECT * FROM url + ORDER BY viewed ASC, parsed_at DESC + OFFSET %s LIMIT %s + """, (offset, limit)) + elif item == "viewed": + cur.execute(""" + SELECT * FROM url + WHERE viewed = true + ORDER BY article_date DESC + OFFSET %s LIMIT %s + """, (offset, limit)) + elif item == "status": + cur.execute(""" + SELECT * FROM url + WHERE status = true + ORDER BY parsed_at DESC + OFFSET %s LIMIT %s + """, (offset, limit)) + elif item == "Япония" or item == "Корея": + cur.execute(""" + SELECT * FROM url + WHERE other = %s + ORDER BY viewed ASC, article_date DESC + OFFSET %s LIMIT %s + """, (item, offset, limit)) + elif item == "Китай": + cur.execute(""" + SELECT * FROM url + WHERE other IN (%s, %s, %s) + ORDER BY viewed ASC, article_date DESC + OFFSET %s LIMIT %s + """, ("source1", "source2", "Китай", offset, limit)) + else: + cur.execute(""" + SELECT * FROM url + ORDER BY viewed ASC, article_date DESC + OFFSET %s LIMIT %s + """, (offset, limit)) + rows = cur.fetchall() + results = [dict(row) for row in rows] + return results + except Exception as e: + raise e + finally: + if conn: + conn.close() \ No newline at end of file diff --git a/settings_work.py b/settings_work.py deleted file mode 100644 index 94e56d0..0000000 --- a/settings_work.py +++ /dev/null @@ -1,39 +0,0 @@ -from pydantic import BaseModel -from typing import List -import json -# Модель для источника -class Source(BaseModel): - name: str - url: str - prompt: str - -# Модель для настроек (список источников) -class Settings(BaseModel): - sources: List[Source] - -# Путь к файлу с настройками -SETTINGS_FILE = "config.json" - -# Чтение настроек из файла -def read_settings() -> Settings: - try: - with open(SETTINGS_FILE, "r", encoding="utf-8") as f: - data = json.load(f) - return Settings(**data) - except (FileNotFoundError, json.JSONDecodeError): - return Settings(sources=[]) - -# Запись настроек в файл -def write_settings(settings: Settings): - with open(SETTINGS_FILE, "w", encoding="utf-8") as f: - json.dump(settings.dict(), f, ensure_ascii=False, indent=2) - -# Обновление данных по источнику -def update_source(new_source: Source) -> dict: - settings = read_settings() - for i, source in enumerate(settings.sources): - if source.name == new_source.name: - settings.sources[i] = new_source - write_settings(settings) - return {"code": 0, "message": f"Источник '{new_source.name}' успешно обновлен."} - return {"code": 1, "message": f"Источник с именем '{new_source.name}' не найден."} \ No newline at end of file