from fastapi import FastAPI, Request, BackgroundTasks, Query
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import logging
import subprocess
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json
import uvicorn
import time
from datetime import datetime
from datetime import datetime as dt
import settings_work as sw
import work_parser as wp

app = FastAPI(
    title="Parser API",
    description="API for running the parsers that load data into the database",
    version="1.0",
)

# Logger configuration
logging.basicConfig(
    filename="app.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


@app.get("/logs")
def get_logs():
    """Return the tail of the application log."""
    with open("app.log", "r") as file:
        lines = file.readlines()[-10:]  # last 10 lines
    return {"logs": lines}


# Initialize the parsing-status table
wp.create_table()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # or a list of allowed origins, e.g. ["http://localhost:8080"]
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

PROXIES_URL = "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt"


def download_proxies(url):
    """Download a plain-text proxy list; returns one proxy per line, or [] on failure."""
    response = requests.get(url, timeout=10)
    if response.status_code == 200:
        return response.text.splitlines()  # one proxy per line
    return []


def fetch_with_proxy(url, proxy, verify, timeout):
    """Fetch a URL through the given HTTP proxy; returns the body text, or None on any error."""
    proxies = {
        'http': f'http://{proxy}',   # use 'socks5://' here for SOCKS5 proxies, etc.
        'https': f'http://{proxy}',
    }
    try:
        response = requests.get(url, proxies=proxies, timeout=timeout, verify=verify)
        response.encoding = 'utf-8'
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        return None
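
# Both extractor functions below repeat the same proxy-rotation loop. A helper
# like the following could factor that loop out. This is only an illustrative
# sketch: the name fetch_via_proxies and the max_attempts cap are assumptions,
# not part of the existing code, and the extractors below still inline the loop.
def fetch_via_proxies(url, timeout=10, verify=True, max_attempts=None):
    """Try each downloaded proxy in turn; return the first successful body, else ''."""
    proxies_list = download_proxies(PROXIES_URL)
    if max_attempts is not None:
        proxies_list = proxies_list[:max_attempts]  # optionally cap the number of tries
    for proxy in proxies_list:
        text = fetch_with_proxy(url, proxy=proxy, timeout=timeout, verify=verify)
        if text:
            return text
    return ""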

# Shared link-extraction helpers
def extract_map_area_hrefs(url, verify=True, ist_number=1):
    """Collect absolute article URLs from a listing page.

    Source 1 links live in <li data-page="1"> items; source 2 uses <map>/<area> tags.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MyScraper/1.0; +https://example.com)"
    }
    resp = requests.get(url, headers=headers, verify=verify)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    hrefs = []
    if ist_number == 1:
        for map_tag in soup.find_all("li", attrs={"data-page": "1"}):
            for a in map_tag.find_all("a", href=True):
                abs_url = urljoin(url, a["href"])
                print(abs_url)
                hrefs.append(abs_url)
    else:
        for map_tag in soup.find_all("map"):
            for area in map_tag.find_all("area", href=True):
                abs_url = urljoin(url, area["href"])
                hrefs.append(abs_url)
    return hrefs


# Parser functions for the first source (the newspaper)
def extract_text_from_url_one(url, timeout=10, verify=True):
    """Fetch an article page via the proxy list; return title + body text, capped at 4500 chars."""
    proxies_list = download_proxies(PROXIES_URL)
    response = ""
    for proxy in proxies_list:
        response = fetch_with_proxy(url, proxy=proxy, timeout=timeout, verify=verify)
        if response:
            break
    else:
        response = ""  # no proxy succeeded
    soup = BeautifulSoup(response, "html.parser")
    title_div = soup.find('div', class_='newsdetatit')
    title_text = ''
    if title_div:
        h3_tag = title_div.find('h3')
        if h3_tag:
            title_text = h3_tag.get_text(strip=True)
    content_div = soup.find('div', class_='newsdetatext')
    content_text = ''
    if content_div:
        founder_content = content_div.find('founder-content')
        if founder_content:
            p_tags = founder_content.find_all('p')
            content_text = '\n'.join(p.get_text(strip=True) for p in p_tags)
    text = title_text + content_text
    if len(text) > 4500:
        text = text[:4500]
    print(len(text))
    return text


# Parser functions for the second source (the military one)
def extract_text_from_url(url, timeout=10, verify=True):
    proxies_list = download_proxies(PROXIES_URL)
    response = ""
    for proxy in proxies_list:
        response = fetch_with_proxy(url, proxy=proxy, timeout=timeout, verify=verify)
        if response:
            break
    else:
        response = ""  # no proxy succeeded
    soup = BeautifulSoup(response, 'html.parser')
    # Find the div.whitecon.article container
    container = soup.find("div", class_="whitecon article")
    if not container:
        return ""
    # Extract the title