1
1
import time
2
- from turtle import st
3
2
import requests
4
3
from bs4 import BeautifulSoup
5
4
import json
6
5
import re
6
+ import logging
7
7
from .openai_extract import openai_extract
8
+ from .get_screenshot import get_screenshot
8
9
def replace_multiple_spaces_with_single(s):
    """Collapse every run of whitespace in *s* into a single space.

    Any sequence of spaces, tabs, or newlines becomes one space; leading
    and trailing whitespace runs are likewise reduced to a single space
    (they are not stripped).
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', s)
10
11
@@ -20,7 +21,7 @@ def check(url):
20
21
else :
21
22
return None
22
23
23
- def craw_single_page (url ,use_template = False ,use_ai = True ):
24
+ def craw_single_page (url ,use_template = False ,use_ai = False ):
24
25
url = check (url )
25
26
if not url :
26
27
return [],[],0
@@ -60,28 +61,41 @@ def craw_single_page(url,use_template=False,use_ai=True):
60
61
body_content = soup .body .get_text () if soup .body else "N/A"
61
62
body_content = replace_multiple_spaces_with_single (body_content )
62
63
# print(body_content)
64
+ if use_ai :
63
65
# 使用AI处理
64
- token_usage = 0
65
- meta_info = f"标题:{ title } ,关键词:{ keywords } ,简介:{ description } "
66
- ai_extract_content ,t = openai_extract (meta_info ,body_content )
67
- token_usage += t
68
- # 判断json是否规范
69
- status = True
70
- time_out = 0
71
- while status and time_out < 5 :
72
- try :
73
- json .dumps (ai_extract_content )
74
- status = False
75
- except :
76
- ai_extract_content ,t = openai_extract (meta_info ,body_content )
77
- token_usage += t
78
- time_out += 1
79
-
66
+ token_usage = 0
67
+ meta_info = f"标题:{ title } ,关键词:{ keywords } ,简介:{ description } "
68
+ ai_extract_content ,t = openai_extract (meta_info ,body_content )
69
+ token_usage += t
70
+ # 判断json是否规范
71
+ status = True
72
+ time_out = 0
73
+ while status and time_out < 5 :
74
+ try :
75
+ json .dumps (ai_extract_content )
76
+ status = False
77
+ except :
78
+ ai_extract_content ,t = openai_extract (meta_info ,body_content )
79
+ token_usage += t
80
+ time_out += 1
80
81
81
- print (ai_extract_content )
82
- print ("🍎" * 50 )
82
+
83
+ print (ai_extract_content )
84
+ print ("🍎" * 50 )
85
+ else :
86
+ ai_extract_content = {}
87
+ token_usage = 0
88
+
89
+ # 提取网页截图
90
+ # TODO: 多线程处理
91
+ try :
92
+ screenshot_path = get_screenshot (url )
93
+ except Exception as e :
94
+ logging .error (f"Error taking screenshot: { url } \n { e } " )
95
+ screenshot_path = None
96
+ logging .info (f"Screenshot saved to: { screenshot_path } " )
83
97
# 结算
84
- page_res = {"title" : title ,"url" :url , "keywords" : keywords , "description" : description , "body" : body_content ,"ai_extract_content" :ai_extract_content }
98
+ page_res = {"title" : title ,"url" :url , "keywords" : keywords , "description" : description , "body" : body_content ,"ai_extract_content" :ai_extract_content , "screenshot_path" : screenshot_path }
85
99
json_str = json .dumps (page_res , ensure_ascii = False )
86
100
return page_res ,json_str ,token_usage
87
101
0 commit comments