Add the ability to save results as a document + move JSON payload construction into a separate function

2026-03-01 22:40:13 +10:00
parent ad07f20c05
commit ba523340df
2 changed files with 129 additions and 82 deletions

.gitignore · 3 changes

@@ -8,7 +8,8 @@ __pycache__/
 # Virtual environment
 .venv/
 save/
+2026/
+2027/
 # IDE
 .vscode/
 .idea/

main.py · 208 changes

@@ -1,25 +1,45 @@
-from fastapi import FastAPI, Request, BackgroundTasks, Query
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
-from apscheduler.schedulers.asyncio import AsyncIOScheduler
-import logging
-import subprocess
-import requests
-from bs4 import BeautifulSoup
-from urllib.parse import urljoin
-import json
-from datetime import datetime as dt
-import uvicorn
-import time
-from datetime import datetime
+# Standard library (stdlib)
+import json
+import logging
+import os
+import subprocess
+import time
+from datetime import datetime as dt
+from datetime import datetime
 
+# Third-party libraries
+from apscheduler.schedulers.asyncio import AsyncIOScheduler
+from bs4 import BeautifulSoup
+from contextlib import asynccontextmanager
+from docx import Document
+from fastapi import BackgroundTasks, FastAPI, Query, Request, Depends
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import FileResponse
+from pydantic import BaseModel
+from urllib.parse import urljoin
+import uvicorn
+import requests
+
+# Local imports
 import settings_work as sw
 import work_parser as wp
 
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Manage the application's lifecycle"""
+    # Startup
+    scheduler.add_job(scheduled_parser_1, "cron", hour=10, minute=0)
+    scheduler.add_job(scheduled_parser_2, "cron", hour=11, minute=0)
+    scheduler.start()
+    yield
+    # Shutdown
+    scheduler.shutdown()
+
 app = FastAPI(title="Parser API",
               description="API for launching parsing into the database",
-              version="1.0")
+              version="1.0",
+              lifespan=lifespan)
 
 # Scheduler initialization
 scheduler = AsyncIOScheduler()
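The lifespan handler above supersedes the deprecated @app.on_event("startup")/@app.on_event("shutdown") hooks that this commit deletes further down. A minimal self-contained sketch of the same pattern, with a placeholder job standing in for the real parsers:

from contextlib import asynccontextmanager

from apscheduler.schedulers.asyncio import AsyncIOScheduler
from fastapi import FastAPI

scheduler = AsyncIOScheduler()

def placeholder_job():
    # Stands in for scheduled_parser_1 / scheduled_parser_2
    print("job fired")

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Everything before `yield` runs once at startup...
    scheduler.add_job(placeholder_job, "cron", hour=10, minute=0)
    scheduler.start()
    yield
    # ...and everything after it runs once at shutdown.
    scheduler.shutdown()

app = FastAPI(lifespan=lifespan)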
@@ -31,7 +51,7 @@ logger = logging.getLogger(__name__)
 @app.get("/logs")
 def get_logs():
     with open("app.log", "r") as file:
-        lines = file.readlines()[-10:]  # last 100 lines
+        lines = file.readlines()[-10:]  # last 10 lines
     return {"logs": lines}
 
 # Parsing status table initialization
@@ -45,10 +65,8 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
 PROXIES_URL = "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt"
-
-
 def download_proxies(url):
     response = requests.get(url)
     if response.status_code == 200:
@@ -57,7 +75,6 @@ def download_proxies(url):
else: else:
return [] return []
def fetch_with_proxy(url, proxy, verify, timeout): def fetch_with_proxy(url, proxy, verify, timeout):
proxies = { proxies = {
'http': f'http://{proxy}', # или 'socks5://' если SOCKS5 и т.п. 'http': f'http://{proxy}', # или 'socks5://' если SOCKS5 и т.п.
@@ -71,7 +88,6 @@ def fetch_with_proxy(url, proxy, verify, timeout):
     except:
         return None
 
-
 # Shared link-extraction helpers
 def extract_map_area_hrefs(url, verify=True, ist_number=1):
     headers = {
@@ -102,7 +118,6 @@ def extract_map_area_hrefs(url, verify=True, ist_number=1):
 # First-source (newspaper) parser functions
 def extract_text_from_url_one(url, timeout=10, verify=True):
     proxies_list = download_proxies(PROXIES_URL)
-
     response = ""
@@ -115,7 +130,6 @@ def extract_text_from_url_one(url, timeout=10, verify=True):
     soup = BeautifulSoup(response, "html.parser")
     title_div = soup.find('div', class_='newsdetatit')
     title_text = ''
-
     if title_div:
@@ -155,7 +169,7 @@ def extract_text_from_url(url, timeout=10, verify=True):
     # Find the div.whitecon.article container
     container = soup.find("div", class_="whitecon article")
     if not container:
-        return ""
+        return "", ""
 
     # Get the <time> heading inside the container
     time_text = container.find('span')
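The two-element early return matters because the function's success path returns a (text, timestamp) pair, so callers unpack two values; a sketch of such a call site (the URL is illustrative):

# With the fix, the "no container" path also yields a 2-tuple,
# so this unpacking no longer raises ValueError on empty pages.
text, time_text = extract_text_from_url("http://example.com/article.html")
if not text:
    print("No article container found")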
@@ -168,12 +182,11 @@ def extract_text_from_url(url, timeout=10, verify=True):
# Возвращаем текстовую сводку # Возвращаем текстовую сводку
content_text = [] content_text = []
for p in paragraphs: for p in paragraphs:
if p.get('class') != ['before_ir']: if p.get('class') != ['before_ir'] :
content_text.append(p.get_text(strip=True)) content_text.append(p.get_text(strip=True))
return "\n".join(content_text), time_t return "\n".join(content_text), time_t
# Общий запрос на GPT # Общий запрос на GPT
def gpt_response_message(content, ist_number=1): def gpt_response_message(content, ist_number=1):
@@ -183,7 +196,6 @@ def gpt_response_message(content, ist_number=1):
contentGPT = Promts[0].prompt.replace('{content}', content) contentGPT = Promts[0].prompt.replace('{content}', content)
else: else:
contentGPT = Promts[1].prompt.replace('{content}', content) contentGPT = Promts[1].prompt.replace('{content}', content)
url = 'http://45.129.78.228:8484' #10.8.0.14:5500 url = 'http://45.129.78.228:8484' #10.8.0.14:5500
params = {'text': contentGPT} params = {'text': contentGPT}
@@ -242,6 +254,50 @@ def create_folder(num):
         num = str(num)
     return num
 
+# Document generation function
+def update_bd_and_create_document(response_text, article_date, url, parsed_at, original_text, other):
+    clean_response = ''
+    try:
+        clean_response = response_text.strip().replace('```json', '').replace('```', '').strip()
+        data = json.loads(clean_response)
+        if data['category']:
+            data['article_date'] = article_date
+            data['url'] = url
+            data['parsed_at'] = parsed_at
+            data['original_text'] = original_text
+            data['status'] = False
+            data['viewed'] = False
+            data['other'] = other
+            print(requests.post('http://45.129.78.228:8002/save_parsed_data', json=data))
+
+            path_day = article_date.split()[0]
+            if not os.path.exists(path_day):
+                os.makedirs(path_day)
+                print(f"Created folder: {path_day}")
+
+            doc = Document()
+            doc.add_heading('Link to the article', level=1)
+            doc.add_paragraph(other)
+            doc.add_heading('Date and time', level=1)
+            doc.add_paragraph(article_date)
+            doc.add_heading('Detected text topics', level=1)
+            doc.add_paragraph(data["category"])
+            doc.add_heading('Title', level=1)
+            doc.add_paragraph(data["title"])
+            doc.add_heading('Brief summary', level=1)
+            doc.add_paragraph(data["short_text"])
+            doc.add_heading('Translated text of the newspaper article', level=1)
+            doc.add_paragraph(data["translation_text"])
+            doc.add_heading('Original text', level=1)
+            doc.add_paragraph(original_text)
+
+            doc_name = f"{data['title']}.docx"
+            doc_path = os.path.join(path_day, doc_name)
+            doc.save(doc_path)
+            print(f"Saved document: {doc_path}")
+    except Exception as ex:
+        print(f"Error processing GPT response: {ex}")
+        logger.info(f"Error processing GPT response: {ex}")
 
 # start functions for the first source (newspaper)
 def start_pars_one_istochnik(data_init):
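Both start_pars_* functions below now delegate JSON parsing, the POST to the save API, and .docx generation to this helper. A hedged usage sketch (the GPT response, dates, and URLs are illustrative):

from datetime import datetime as dt

# article_date.split()[0] becomes the folder, so with "2026/03/01" the
# document lands in ./2026/03/01/<title>.docx (hence the new .gitignore
# entries for 2026/ and 2027/).
update_bd_and_create_document(
    response_text='```json\n{"category": "economy", "title": "Example", '
                  '"short_text": "...", "translation_text": "..."}\n```',
    article_date="2026/03/01",
    url="http://example.com/article.html",    # placeholder article link
    parsed_at=str(dt.now()),
    original_text="original article text",
    other="http://example.com/index.html",    # placeholder source page URL
)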
@@ -261,13 +317,13 @@ def start_pars_one_istochnik(data_init):
     for page_number in range(1, 9):
-        start_url = f'http://epaper.hljnews.cn/hljrb/pc/layout/{current_year}{current_month}/{current_day}/node_0{page_number}.html'
-        wp.update_task(task_id, status='in_progress', source_url=start_url, started_at=datetime.utcnow())
-        print(f"Collecting hrefs from: {start_url}")
+        url = f'http://epaper.hljnews.cn/hljrb/pc/layout/{current_year}{current_month}/{current_day}/node_0{page_number}.html'
+        wp.update_task(task_id, status='in_progress', source_url=url, started_at=datetime.utcnow())
+        print(f"Collecting hrefs from: {url}")
         try:
-            hrefs = extract_map_area_hrefs(start_url, ist_number=2)
+            hrefs = extract_map_area_hrefs(url, ist_number=2)
         except Exception as e:
             print(f"Error extracting links: {e}")
             logger.info(f"extract_map_area_hrefs: {e}")
@@ -280,24 +336,24 @@ def start_pars_one_istochnik(data_init):
if len(text) >= 100: if len(text) >= 100:
response_text = gpt_response_message(text, ist_number=2) response_text = gpt_response_message(text, ist_number=2)
print(response_text) print(response_text)
clean_response = '' update_bd_and_create_document(response_text=response_text, article_date=f"{current_year}/{current_month}/{current_day}", url=link, parsed_at=str(dt.now()), original_text=text, other=url)
try: # clean_response = ''
clean_response = response_text.strip().replace('```json', '').replace('```', '').strip() # try:
data = json.loads(clean_response) # clean_response = response_text.strip().replace('```json', '').replace('```', '').strip()
data['article_date'] = f"{current_day}/{current_month}/{current_year}" # data = json.loads(clean_response)
data['url'] = link # data['article_date'] = f"{current_day}/{current_month}/{current_year}"
data['parsed_at'] = str(dt.now()) # data['url'] = link
data['original_text'] = text # data['parsed_at'] = str(dt.now())
data['status'] = False # data['original_text'] = text
data['viewed'] = False # data['status'] = False
data['other'] = start_url # data['viewed'] = False
# data['other'] = url
if data['category']: # if data['category']:
print(requests.post('http://45.129.78.228:8002/save_parsed_data', json=data)) # print(requests.post('http://45.129.78.228:8002/save_parsed_data', json=data))
except Exception as ex: # except Exception as ex:
print(f"Ошибка при обработке ответа GPT: {ex}") # print(f"Ошибка при обработке ответа GPT: {ex}")
logger.info(f"gpt_response_message: {ex}") # logger.info(f"gpt_response_message: {ex}")
continue # continue
wp.update_task(task_id, status='completed', finished_at=datetime.utcnow()) wp.update_task(task_id, status='completed', finished_at=datetime.utcnow())
@@ -324,31 +380,29 @@ def start_pars_two_istochnik():
                     if len(text) >= 100:
                         response_text = gpt_response_message(text)
                         print(response_text)
-                        clean_response = ''
-                        try:
-                            clean_response = response_text.strip().replace('```json', '').replace('```', '').strip()
-                            data = json.loads(clean_response)
-                            data['article_date'] = time_text
-                            data['url'] = hrefs
-                            data['parsed_at'] = str(dt.now())
-                            data['original_text'] = text
-                            data['status'] = False
-                            data['viewed'] = False
-                            data['other'] = url
-                            # print[date]
-                            if data['category']:
-                                print(requests.post('http://45.129.78.228:8002/save_parsed_data', json=data))
-                        except Exception as ex:
-                            print(f"Error processing GPT response: {ex}")
-                            logger.info(f"Error processing GPT response: {ex}")
-                            continue
+                        update_bd_and_create_document(response_text=response_text, article_date=time_text, url=hrefs, parsed_at=str(dt.now()), original_text=text, other=url)
+                        # clean_response = ''
+                        # try:
+                        #     clean_response = response_text.strip().replace('```json', '').replace('```', '').strip()
+                        #     data = json.loads(clean_response)
+                        #     data['article_date'] = time_text
+                        #     data['url'] = hrefs
+                        #     data['parsed_at'] = str(dt.now())
+                        #     data['original_text'] = text
+                        #     data['status'] = False
+                        #     data['viewed'] = False
+                        #     data['other'] = url
+                        #     if data['category']:
+                        #         print(requests.post('http://45.129.78.228:8002/save_parsed_data', json=data))
+                        # except Exception as ex:
+                        #     print(f"Error processing GPT response: {ex}")
+                        #     logger.info(f"Error processing GPT response: {ex}")
+                        #     continue
                 except:
                     continue
 
         wp.update_task(task_id, status='completed', finished_at=datetime.utcnow())
 # Functions for automatic launch
@@ -361,19 +415,6 @@ def scheduled_parser_2():
     """Scheduler for the second parser"""
     start_pars_two_istochnik()
 
-@app.on_event("startup")
-async def start_scheduler():
-    """Start the scheduler when the application starts"""
-    scheduler.add_job(scheduled_parser_1, "cron", hour=10, minute=0)
-    scheduler.add_job(scheduled_parser_2, "cron", hour=11, minute=0)
-    scheduler.start()
-
-@app.on_event("shutdown")
-async def stop_scheduler():
-    """Stop the scheduler on shutdown"""
-    scheduler.shutdown()
-
 class ParserOneRequest(BaseModel):
     time: str
@@ -407,5 +448,10 @@ def set_settings(settings: sw.Source):
 def delete_task(task_id: int):
     return print(wp.delete_task(task_id))
 
-# if __name__ == "__main__":
-#     uvicorn.run("main:app", port=8001, reload=True)
+@app.get("/file_download", summary="Endpoint for downloading a file")
+async def download_file(path: str, title: str):
+    path = f"./{path}/{title}.docx"  # os.path.abspath(path)
+    return FileResponse(path=path, filename=f'{title}.docx', media_type='multipart/form-data')
+
+if __name__ == "__main__":
+    uvicorn.run("main:app", port=8001, reload=True)
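A hedged client-side example of retrieving a saved document through the new endpoint (host, port, and file names are illustrative):

import requests

# Assumes the API runs locally on port 8001 and the parser previously
# saved ./2026/03/01/Example.docx via update_bd_and_create_document.
resp = requests.get(
    "http://localhost:8001/file_download",
    params={"path": "2026/03/01", "title": "Example"},
)
resp.raise_for_status()
with open("Example.docx", "wb") as f:
    f.write(resp.content)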