Compare commits

...

17 Commits

Author SHA1 Message Date
14f23d32a6 Обновить main.py
Some checks failed
continuous-integration/drone/push Build is failing
continuous-integration/drone Build is passing
test
2026-04-14 05:28:31 +00:00
cc18b98946 парсер всех источников
Some checks failed
continuous-integration/drone/push Build is passing
continuous-integration/drone Build is failing
2026-04-12 19:11:49 +10:00
4405400715 все источники
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 22:16:33 +10:00
662cfa0994 sourse
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 21:33:43 +10:00
89da884e66 expose_headers=["*"],
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 21:20:57 +10:00
42157e67e1 исправил ошибки
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 21:14:25 +10:00
15f637eb33 async
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone Build is passing
2026-04-11 20:35:02 +10:00
be7b042e7c перевел chek_url в базу данных проекта, то бы не взаимодействовать со сторонним сервисом
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 20:27:48 +10:00
9fd823d8d1 еще
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone Build is passing
2026-04-11 16:04:39 +10:00
5c0cdd03e7 добавлена проверка
Some checks failed
continuous-integration/drone/push Build was killed
2026-04-11 16:03:53 +10:00
4098ac7d8d может так
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 15:52:18 +10:00
5805ab0fe2 исправил url_ist
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 15:39:11 +10:00
f39ade5245 ytljxtns
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone Build is passing
2026-04-11 14:10:05 +10:00
bf8bc173a1 доработка
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 14:07:01 +10:00
38ec470b67 поправил правила хранения промтов
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 14:06:12 +10:00
7e05dda3cb изменил старт первого источника
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 13:50:51 +10:00
4720aa05bc убрал закрытия подключения
All checks were successful
continuous-integration/drone/push Build is passing
2026-04-11 13:11:42 +10:00
2 changed files with 130 additions and 87 deletions

71
main.py
View File

@@ -64,6 +64,7 @@ app.add_middleware(
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
# expose_headers=["*"],
)
PROXIES_URL = "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt"
@@ -232,15 +233,25 @@ def gpt_response_message(content: str, name_promt: str):
# Общие функции проверки ссылок
def check_url(url):
    """Report whether ``url`` is already recorded in the project database.

    The lookup is delegated to ``wp.check_url_exists``, which returns a plain
    dict of the form ``{"exists": bool}``. It is not an HTTP response, so it
    has no ``status_code`` and no ``json()``; treating it like one made every
    call fall into the error branch and report the URL as new.

    Args:
        url: Link to look up.

    Returns:
        True when the URL is already known. False when it is not known, and
        also when the lookup itself fails: a failed check is deliberately
        treated as "new URL" so that parsing is not blocked.
    """
    print(url)
    try:
        result = wp.check_url_exists(url)
        exists = bool(result["exists"])
    except Exception as e:
        # Best effort by design: any failure means "treat the URL as new".
        print(f"Ошибка при проверке URL: {e}")
        logger.error(f"check_url error: {e}")
        return False
    print(exists)
    return exists
# функции даты первого источника (газета)
@@ -305,9 +316,9 @@ def update_bd_and_create_document(response_text, article_date, url, parsed_at, o
#Функции start первого источника (газета)
def start_pars_one_istochnik(data_init=""):
if data_init != ['']:
current_day = data_init[0]
current_day = data_init[2]
current_month = data_init[1]
current_year = data_init[2]
current_year = data_init[0]
else:
datetime_now = dt.now()
current_day = create_folder(datetime_now.day)
@@ -336,7 +347,7 @@ def start_pars_one_istochnik(data_init=""):
print(f"Страница {page_number} [{i}/{len(hrefs)}] parsing {link}")
text = extract_text_from_url_one(link)
if len(text) >= 100:
response_text = gpt_response_message(text, url_ist = "http://epaper.hljnews.cn/hljrb/pc/layout")
response_text = gpt_response_message(text, "source1")
print(response_text)
if response_text:
update_bd_and_create_document(response_text=response_text, article_date=f"{current_year}/{current_month}/{current_day}", url=link, parsed_at=str(dt.now()), original_text=text, other=url)
@@ -364,7 +375,7 @@ def start_pars_two_istochnik():
try:
text, time_text = extract_text_from_url(hrefs)
if len(text) >= 100:
response_text = gpt_response_message(text, url_ist = "https://def.ltn.com.tw/breakingnewslist")
response_text = gpt_response_message(text, "source2")
print(response_text)
if response_text:
update_bd_and_create_document(response_text=response_text, article_date=time_text, url=hrefs, parsed_at=str(dt.now()), original_text=text, other=url)
@@ -375,6 +386,7 @@ def start_pars_two_istochnik():
#Функции start любого источника
def start_pars_all_istochnik(url:str, promt:str):
# print(f"Начало парсинга: {url} с промтом: {promt}")
task_id = wp.insert_task(status='queued', source_url=url)
try:
@@ -447,7 +459,7 @@ class ParserOneRequest(BaseModel):
@app.post("/parser_1", summary="Запуск процесса парсинга первого источника")
async def process_parser_one_ist(data: ParserOneRequest, background_tasks: BackgroundTasks):
    """Schedule parsing of the first source as a background task.

    ``data.time`` is split on "-" and the resulting list of parts is handed
    to ``start_pars_one_istochnik``.

    Returns:
        A confirmation message; the parsing itself runs after the response.
    """
    istochnik = data.time.split("-")
    # Pass the callable and its argument separately. Calling the parser here
    # would run it synchronously before the response is sent and would give
    # add_task its return value instead of a function.
    background_tasks.add_task(start_pars_one_istochnik, istochnik)
    return {"message": "Процесс парсинга 1 источника запущен"}
@app.post("/parser_2" , summary="Запуск процеса парсинга второго источника")
@@ -461,39 +473,42 @@ class Parserall(BaseModel):
@app.post("/add_sources" , summary="Добавление парсинга любого источника")
async def add_sources_all_ist(sources: Parserall):
    """Store a source URL together with its prompt.

    Returns:
        A status envelope whose ``data`` field carries whatever
        ``wp.add_sources`` returned.
    """
    # str() because the model field may be a URL object rather than a plain
    # string -- TODO confirm against the Parserall definition.
    result = wp.add_sources(str(sources.url), sources.promt)
    return {"status": "success", "message": "Источник добавлен", "data": result}
@app.get("/all_sources", summary="Метод получения всех источников")
async def get_all_sources():
    """Return every stored source as reported by ``wp.get_all_sources``."""
    # NOTE(review): wp.* calls are blocking psycopg2 calls; inside an
    # ``async def`` handler they run on the event loop -- confirm this is
    # intended.
    return wp.get_all_sources()
@app.post("/parser_all" , summary="Запуск процеса парсинга любого источника")
async def process_parser_all_ist(url: Parserall, background_tasks: BackgroundTasks):
    """Schedule parsing of an arbitrary source as a background task.

    Returns:
        A confirmation message; the parsing itself runs after the response.
    """
    # The callable and its arguments are passed separately so the parser is
    # executed by the background task runner, not while building the reply.
    background_tasks.add_task(start_pars_all_istochnik, str(url.url), url.promt)
    return {"message": "Процесс парсинга любого источника запущен"}
# GET метод для получения
@app.get("/get_tasks_offset", summary="Метод получения задач парсинга")
async def get_tasks_offset(limit: int = Query(10, gt=0), offset: int = Query(0, ge=0)):
    """Return one page of parser tasks.

    Args:
        limit: Page size; must be greater than 0 (default 10).
        offset: Number of tasks to skip; must be 0 or more (default 0).
    """
    return wp.get_tasks_offset(limit, offset)
# GET метод для получения настроек
@app.get("/settings", summary="Метод получения настроек парсера")
async def get_settings():
    """Return all stored prompts as reported by ``wp.get_all_promt``."""
    return wp.get_all_promt()
@app.get("/categories_promt", summary="Метод получения categories_promt")
async def get_categories_promt():
    """Return the prompt names as reported by ``wp.get_all_categories_promt``."""
    return wp.get_all_categories_promt()
class Source(BaseModel):
    """Request body for saving a prompt."""

    # The settings endpoint identifies a prompt by ``name`` only, so ``url``
    # defaults to an empty string: clients that still send it keep working
    # and clients that omit it are no longer rejected.
    url: str = ""
    name: str
    promt: str
# POST метод для установки настроек
@app.post("/settings", summary="Метод сохранения настроек парсера")
async def set_settings(settings: Source):
    """Create or overwrite the prompt stored under ``settings.name``.

    Returns:
        Whatever ``wp.update_promt`` returns.
    """
    # update_promt takes (name, promt); the prompt is keyed by name, not URL.
    return wp.update_promt(settings.name, settings.promt)
@app.delete("/delete_task/{task_id}", summary="Метод удаления задачи")
async def delete_task(task_id: int):
    """Delete a parser task by id.

    Returns:
        The result of ``wp.delete_task``. It is printed for the console log
        and returned to the client; returning ``print(...)`` would always
        send ``null`` because ``print`` returns None.
    """
    result = wp.delete_task(task_id)
    print(result)
    return result
@app.get("/file_download", summary="Метод для скачивания файла")
@@ -598,10 +613,10 @@ async def download_all(dates: DownloadRange, background_tasks: BackgroundTasks):
return response
@app.get("/logs", summary="Показать логи")
async def get_logs():
    """Return the last 10 lines of ``app.log``.

    Returns:
        ``{"logs": [...]}`` with at most 10 lines, newline characters kept.
        The list is empty when the log file does not exist yet.
    """
    from collections import deque

    try:
        with open("app.log", "r") as file:
            # A bounded deque keeps only the tail while streaming the file,
            # instead of loading every line into memory first.
            lines = list(deque(file, maxlen=10))
    except FileNotFoundError:
        lines = []
    return {"logs": lines}
# Development entry point. It only runs for ``python main.py``; when the app
# is started through the uvicorn command line this guard is never entered.
if __name__ == "__main__":
    uvicorn.run("main:app", port=8001, reload=True)

View File

@@ -1,26 +1,46 @@
import os

import psycopg2
from psycopg2.extras import RealDictCursor
# Database connection parameters, passed as keyword arguments to
# psycopg2.connect(). Each credential can be overridden through an
# environment variable; the literals are fallbacks that keep existing
# deployments working.
# SECURITY: the fallback password is stored in the repository. Set
# PARSER_DB_PASSWORD in the environment and rotate the credential.
DB_CONFIG = {
    "dbname": os.environ.get("PARSER_DB_NAME", "parsed_url"),
    "user": os.environ.get("PARSER_DB_USER", "postgres"),
    "password": os.environ.get("PARSER_DB_PASSWORD", "qwertyqwerty123123"),
    "host": os.environ.get("PARSER_DB_HOST", "45.129.78.228"),
    "connect_timeout": 10,
    "options": "-c statement_timeout=30000",  # per-statement timeout: 30 s
}
# Module-wide database connection; opened lazily by get_connection().
conn = None


def close_connection():
    """Close the shared database connection if one is open."""
    global conn
    if conn is not None and not conn.closed:
        conn.close()
    conn = None


def get_connection():
    """Return the shared database connection, opening it when needed.

    A new connection is created from ``DB_CONFIG`` only when none exists yet
    or the previous one has been closed; otherwise the existing connection
    is reused. New connections are switched to autocommit.

    Returns:
        The open psycopg2 connection.
    """
    global conn
    if conn is None or conn.closed:
        conn = psycopg2.connect(**DB_CONFIG)
        conn.autocommit = True
    return conn
def check_url_exists(url: str):
    """Check whether ``url`` is present in the ``url`` table.

    Args:
        url: The URL to look up.

    Returns:
        ``{"exists": True}`` when a matching row is found, otherwise
        ``{"exists": False}``.

    Raises:
        Any database error is propagated unchanged. This module does not
        raise ``HTTPException``: the name is not imported in the import
        block at the top of the file, and turning a failure into an HTTP
        error is left to the web layer.
    """
    conn = get_connection()
    with conn.cursor() as cursor:
        cursor.execute(
            "SELECT 1 FROM url WHERE url = %s LIMIT 1",
            (url,)
        )
        return {"exists": cursor.fetchone() is not None}
# работа с базой данных показывания задач work_parser
def create_table():
conn = get_connection()
try:
with conn.cursor() as cur:
cur.execute("""
@@ -38,10 +58,10 @@ def create_table():
""")
print("Таблица work_parser создана или уже существует")
finally:
if conn:
conn.close()
pass
def insert_task(status, source_url=None, source_id=None, priority=0):
conn = get_connection()
try:
with conn.cursor() as cur:
cur.execute("""
@@ -52,10 +72,10 @@ def insert_task(status, source_url=None, source_id=None, priority=0):
task_id = cur.fetchone()[0]
return task_id
finally:
if conn:
conn.close()
pass
def get_tasks_offset(limit, offset):
conn = get_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("""
@@ -66,10 +86,10 @@ def get_tasks_offset(limit, offset):
tasks = cur.fetchall()
return tasks
finally:
if conn:
conn.close()
pass
def delete_task(task_id: int):
conn = get_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("DELETE FROM work_parser WHERE id = %s RETURNING *;", (task_id,))
@@ -79,10 +99,10 @@ def delete_task(task_id: int):
else:
return {"message": f"Задача с id {task_id} не найдена"}
finally:
if conn:
conn.close()
pass
def update_task(task_id, **fields):
conn = get_connection()
try:
allowed_fields = ['status', 'started_at', 'finished_at', 'source_url', 'error_message', 'attempts', 'priority']
set_parts = []
@@ -99,64 +119,62 @@ def update_task(task_id, **fields):
cur.execute(f"UPDATE work_parser SET {set_sql} WHERE id = %s;", values)
return True
finally:
if conn:
conn.close()
pass
# Создание и работа с таблицей по созданию и редактированию промтов
def create_table_config_gpt():
    """Create the ``config_gpt`` table (prompt name -> prompt text) if missing.

    The shared connection is left open for later calls.
    """
    conn = get_connection()
    with conn.cursor() as cur:
        # NOTE(review): IF NOT EXISTS leaves an already existing table as it
        # is. A table created with the earlier layout (``url`` as primary
        # key) is not migrated by this statement -- verify the live schema.
        cur.execute("""
            CREATE TABLE IF NOT EXISTS config_gpt (
                name VARCHAR(20) PRIMARY KEY,
                promt TEXT
            );
        """)
    print("Таблица config_gpt создана или уже существует")
def update_promt(name: str, promt: str):
    """Insert the prompt stored under ``name`` or overwrite the existing one.

    Args:
        name: Key of the prompt in ``config_gpt``.
        promt: Prompt text to store.
    """
    conn = get_connection()
    with conn.cursor() as cur:
        cur.execute("""
            INSERT INTO config_gpt (name, promt)
            VALUES (%s, %s)
            ON CONFLICT (name) DO UPDATE SET
                promt = EXCLUDED.promt
        """, (name, promt))
    # The shared connection is opened in autocommit mode; the explicit commit
    # is kept so the write is still persisted if that setting changes.
    conn.commit()
def get_promt(promt_name_url):
    """Return the prompt text stored under the given name.

    Args:
        promt_name_url: Value matched against the ``name`` column of
            ``config_gpt``.

    Returns:
        The ``promt`` column of the matching row.

    Raises:
        LookupError: No row has this name. Failing with a clear message
            replaces the opaque ``TypeError`` that subscripting a missing
            row (None) produced.
    """
    conn = get_connection()
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute("SELECT promt FROM config_gpt WHERE name = %s", (promt_name_url,))
        row = cur.fetchone()
    if row is None:
        raise LookupError(f"No prompt stored under name {promt_name_url!r}")
    return row["promt"]
def get_all_promt():
    """Return every row of ``config_gpt`` as name/prompt pairs.

    Returns:
        ``{"sources": [{"name": ..., "promt": ...}, ...]}``.
    """
    conn = get_connection()
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute("SELECT * FROM config_gpt")
        rows = cur.fetchall()
    # Only name and promt are exposed, whatever other columns the table has.
    sources = [{"name": row["name"], "promt": row["promt"]} for row in rows]
    return {"sources": sources}
def get_all_categories_promt():
conn = get_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("SELECT name FROM config_gpt")
@@ -164,11 +182,11 @@ def get_all_categories_promt():
return [row["name"] for row in rows]
finally:
if conn:
conn.close()
pass
# Создание, сохранение и работа с таблицей ошибочных ссылок (error_url)
def create_table_error_url():
conn = get_connection()
try:
with conn.cursor() as cur:
cur.execute("""
@@ -180,10 +198,10 @@ def create_table_error_url():
""")
print("Таблица error_url создана или уже существует")
finally:
if conn:
conn.close()
pass
def add_error_url(source_url: str, error_sources_url: str):
conn = get_connection()
try:
with conn.cursor() as cur:
cur.execute("""
@@ -193,10 +211,10 @@ def add_error_url(source_url: str, error_sources_url: str):
""", (source_url, error_sources_url))
return cur.fetchone()[0]
finally:
if conn:
conn.close()
pass
def check_error_url(error_sources_url: str) -> bool:
conn = get_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("""
@@ -208,11 +226,11 @@ def check_error_url(error_sources_url: str) -> bool:
return row is None
finally:
if conn:
conn.close()
pass
# Создание и работа с таблицей источников add_sources
# Создание и работа с таблицей источников sources
def create_table_add_sourse():
conn = get_connection()
try:
with conn.cursor() as cur:
cur.execute("""
@@ -223,23 +241,33 @@ def create_table_add_sourse():
""")
print("Таблица sourse создана или уже существует")
finally:
if conn:
conn.close()
pass
def add_sources(url: str, promt: str):
    """Insert a source into ``sourse`` or update the prompt of an existing one.

    Args:
        url: Source URL; an existing row with the same URL is updated.
        promt: Prompt text to associate with the source.

    Returns:
        The stored record as ``{"url": ..., "promt": ...}``, so the caller
        has something meaningful to report back.
    """
    conn = get_connection()
    with conn.cursor() as cur:
        cur.execute("""
            INSERT INTO sourse (url, promt)
            VALUES (%s, %s)
            ON CONFLICT (url) DO UPDATE SET
                promt = EXCLUDED.promt
        """, (url, promt))
    # The shared connection is opened in autocommit mode; the explicit commit
    # is kept so the write is still persisted if that setting changes.
    conn.commit()
    return {"url": url, "promt": promt}
def get_all_sources():
    """Return every row of the ``sourse`` table.

    Returns:
        ``{"sources": [{"url": ..., "promt": ...}, ...]}``.
    """
    conn = get_connection()
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute("SELECT * FROM sourse")
        rows = cur.fetchall()
    sources = [{"url": row["url"], "promt": row["promt"]} for row in rows]
    return {"sources": sources}
# Пример использования
# if __name__ == "__main__":
# # create_table_config_gpt() # <-- раскомментировать эту строку