From ba523340df30498547d5a5da87f331914ad10244 Mon Sep 17 00:00:00 2001 From: Yro Boros Date: Sun, 1 Mar 2026 22:40:13 +1000 Subject: [PATCH] =?UTF-8?q?=D0=94=D0=BE=D0=B1=D0=B0=D0=B2=D0=BB=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=D0=B5=20=D0=B2=D0=BE=D0=B7=D0=BC=D0=BE=D0=B6=D0=BD?= =?UTF-8?q?=D0=BE=D1=81=D1=82=D0=B8=20=D1=81=D0=BE=D1=85=D1=80=D0=B0=D0=BD?= =?UTF-8?q?=D0=B5=D0=BD=D0=B8=D1=8F=20=D0=B4=D0=BE=D0=BA=D1=83=D0=BC=D0=B5?= =?UTF-8?q?=D0=BD=D1=82=D0=BE=D0=BC=20+=20=D0=B2=D1=8B=D0=BD=D0=BE=D1=81?= =?UTF-8?q?=20=D1=84=D0=BE=D1=80=D0=BC=D0=B8=D1=80=D0=BE=D0=B2=D0=B0=D0=BD?= =?UTF-8?q?=D0=B8=D0=B5=20json=20=D1=84=D0=B0=D0=B9=D0=BB=D0=B0=20=D0=B2?= =?UTF-8?q?=20=D0=BE=D1=82=D0=B4=D0=B5=D0=BB=D1=8C=D0=BD=D1=83=D1=8E=20?= =?UTF-8?q?=D1=84=D1=83=D0=BD=D0=BA=D1=86=D0=B8=D1=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 +- main.py | 208 ++++++++++++++++++++++++++++++++--------------------- 2 files changed, 129 insertions(+), 82 deletions(-) diff --git a/.gitignore b/.gitignore index 21f898a..7c96b6d 100644 --- a/.gitignore +++ b/.gitignore @@ -8,7 +8,8 @@ __pycache__/ # Виртуальное окружение .venv/ save/ - +2026/ +2027/ # IDE .vscode/ .idea/ diff --git a/main.py b/main.py index 5ad6075..ed9883b 100644 --- a/main.py +++ b/main.py @@ -1,25 +1,45 @@ -from fastapi import FastAPI, Request, BackgroundTasks, Query -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel -from apscheduler.schedulers.asyncio import AsyncIOScheduler -import logging -import subprocess -import requests -from bs4 import BeautifulSoup -from urllib.parse import urljoin +# Стандартные библиотеки (stdlib) import json -from datetime import datetime as dt -import uvicorn - +import logging +import os +import subprocess import time +from datetime import datetime as dt from datetime import datetime +# Сторонние библиотеки (third-party) +from apscheduler.schedulers.asyncio import AsyncIOScheduler +from bs4 import BeautifulSoup +from contextlib import asynccontextmanager +from docx import Document +from fastapi import BackgroundTasks, FastAPI, Query, Request, Depends +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import FileResponse +from pydantic import BaseModel +from urllib.parse import urljoin +import uvicorn + +import requests + +# Локальные импорты import settings_work as sw import work_parser as wp +@asynccontextmanager +async def lifespan(app: FastAPI): + """Управление жизненным циклом приложения""" + # Startup + scheduler.add_job(scheduled_parser_1, "cron", hour=10, minute=0) + scheduler.add_job(scheduled_parser_2, "cron", hour=11, minute=0) + scheduler.start() + yield + # Shutdown + scheduler.shutdown() + app = FastAPI(title="Parser API", description="API для запуска парсинга в базу данных", - version="1.0") + version="1.0", + lifespan=lifespan) # Инициализация планировщика scheduler = AsyncIOScheduler() @@ -31,7 +51,7 @@ logger = logging.getLogger(__name__) @app.get("/logs") def get_logs(): with open("app.log", "r") as file: - lines = file.readlines()[-10:] # последние 100 строк + lines = file.readlines()[-10:] # последние 10 строк return {"logs": lines} # Инициализация таблицы статуса парсинга @@ -45,10 +65,8 @@ app.add_middleware( allow_headers=["*"], ) - PROXIES_URL = "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt" - def download_proxies(url): response = requests.get(url) if response.status_code == 200: @@ -57,7 +75,6 @@ def download_proxies(url): else: return [] - def fetch_with_proxy(url, proxy, verify, timeout): proxies = { 'http': f'http://{proxy}', # или 'socks5://' если SOCKS5 и т.п. @@ -71,7 +88,6 @@ def fetch_with_proxy(url, proxy, verify, timeout): except: return None - # Общие функции нахождения ссылок def extract_map_area_hrefs(url, verify=True, ist_number=1): headers = { @@ -102,7 +118,6 @@ def extract_map_area_hrefs(url, verify=True, ist_number=1): # функции парсера первого источника (газета) def extract_text_from_url_one(url, timeout=10, verify=True): - proxies_list = download_proxies(PROXIES_URL) response = "" @@ -115,7 +130,6 @@ def extract_text_from_url_one(url, timeout=10, verify=True): soup = BeautifulSoup(response, "html.parser") - title_div = soup.find('div', class_='newsdetatit') title_text = '' if title_div: @@ -155,7 +169,7 @@ def extract_text_from_url(url, timeout=10, verify=True): # Находим контейнер div.whitecon.article container = soup.find("div", class_="whitecon article") if not container: - return "" + return "", "" # Получение заголовка