Skip to content

Commit e7d2249

Browse files
committed
批量截图 & AI使用开关
1 parent 40edb5b commit e7d2249

File tree

2 files changed

+40
-21
lines changed

2 files changed

+40
-21
lines changed

main.py

+5
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,8 @@
1+
import logging
2+
import colorlog
3+
import selenium
4+
logger = logging.getLogger(__name__)
5+
logging.basicConfig(level=logging.INFO)
16
import json
27
import config
38
from utils.crawer import phar_sitemap_url, integrate_sitemap_with_details

utils/crawer/craw_single_page.py

+35 -21
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,11 @@
11
import time
2-
from turtle import st
32
import requests
43
from bs4 import BeautifulSoup
54
import json
65
import re
6+
import logging
77
from .openai_extract import openai_extract
8+
from .get_screenshot import get_screenshot
89
# Collapse every run of whitespace in `s` (the pattern is \s+, so tabs and
# newlines are included, not only spaces) into a single space character and
# return the resulting string.
def replace_multiple_spaces_with_single(s):
910
return re.sub(r'\s+', ' ', s)
1011

@@ -20,7 +21,7 @@ def check(url):
2021
else:
2122
return None
2223

23-
def craw_single_page(url,use_template=False,use_ai=True):
24+
def craw_single_page(url,use_template=False,use_ai=False):
2425
url = check(url)
2526
if not url:
2627
return [],[],0
@@ -60,28 +61,41 @@ def craw_single_page(url,use_template=False,use_ai=True):
6061
body_content = soup.body.get_text() if soup.body else "N/A"
6162
body_content = replace_multiple_spaces_with_single(body_content)
6263
# print(body_content)
64+
if use_ai:
6365
# 使用AI处理
64-
token_usage = 0
65-
meta_info = f"标题:{title},关键词:{keywords},简介:{description}"
66-
ai_extract_content,t = openai_extract(meta_info,body_content)
67-
token_usage += t
68-
# 判断json是否规范
69-
status = True
70-
time_out = 0
71-
while status and time_out<5:
72-
try:
73-
json.dumps(ai_extract_content)
74-
status = False
75-
except:
76-
ai_extract_content,t = openai_extract(meta_info,body_content)
77-
token_usage += t
78-
time_out += 1
79-
66+
token_usage = 0
67+
meta_info = f"标题:{title},关键词:{keywords},简介:{description}"
68+
ai_extract_content,t = openai_extract(meta_info,body_content)
69+
token_usage += t
70+
# 判断json是否规范
71+
status = True
72+
time_out = 0
73+
while status and time_out<5:
74+
try:
75+
json.dumps(ai_extract_content)
76+
status = False
77+
except:
78+
ai_extract_content,t = openai_extract(meta_info,body_content)
79+
token_usage += t
80+
time_out += 1
8081

81-
print(ai_extract_content)
82-
print("🍎"*50)
82+
83+
print(ai_extract_content)
84+
print("🍎"*50)
85+
else:
86+
ai_extract_content = {}
87+
token_usage = 0
88+
89+
# 提取网页截图
90+
# TODO: 多线程处理
91+
try:
92+
screenshot_path = get_screenshot(url)
93+
except Exception as e:
94+
logging.error(f"Error taking screenshot: {url}\n {e}")
95+
screenshot_path = None
96+
logging.info(f"Screenshot saved to: {screenshot_path}")
8397
# 结算
84-
page_res = {"title": title,"url":url, "keywords": keywords, "description": description, "body": body_content,"ai_extract_content":ai_extract_content}
98+
page_res = {"title": title,"url":url, "keywords": keywords, "description": description, "body": body_content,"ai_extract_content":ai_extract_content,"screenshot_path":screenshot_path}
8599
json_str = json.dumps(page_res, ensure_ascii=False)
86100
return page_res,json_str,token_usage
87101

0 commit comments

Comments
 (0)