 from sqlmodel import create_engine, Session, select
 from datetime import datetime, timezone, timedelta
 from loguru import logger
-from redis import Redis
-from rq import Queue
+from redis import Redis  # type: ignore
+from rq import Queue, Retry
 
 
 from app.models.article import Article
 from app.models.feed import Feed, parse_feed, generate_feed
@@ -25,32 +25,6 @@
 gpu_queue = Queue("gpu", connection=redis_conn)
 
 
-def fetch_feed(feed_id):
-    with Session(ENGINE) as session:
-        feed = session.get(Feed, feed_id)
-        if not feed:
-            logger.error(f"Feed {feed_id} not found")
-            return
-
-        try:
-            parsed_feed = asyncio.run(parse_feed(feed.url))
-            for article in parsed_feed.articles:
-                existing_article = session.exec(
-                    select(Article).where(Article.url == article.url)
-                ).first()
-                if not existing_article:
-                    article.feed = feed
-                    session.add(article)
-                    session.commit()
-                    gpu_queue.enqueue(compute_article_embedding, article.id)
-
-            logger.info(
-                f"Fetched {len(parsed_feed.articles)} articles for feed {feed_id}"
-            )
-        except Exception as e:
-            logger.error(f"Error fetching feed {feed_id}: {e}")
-
-
 def compute_article_embedding(article_id):
     with Session(ENGINE) as session:
         article = session.get(Article, article_id)
@@ -87,7 +61,7 @@ def remove_old_embeddings():
         old_articles = session.exec(
             select(Article)
             .where(Article.updated < one_month_ago)
-            .where(Article.embedding is not None)
+            .where(Article.embedding != None)  # noqa: E711
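+            # "!= None" makes SQLAlchemy emit an IS NOT NULL clause; "is not None" would always be truthy here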
         ).all()
 
         for article in old_articles:
@@ -97,12 +71,86 @@ def remove_old_embeddings():
         logger.info(f"Removed embeddings from {len(old_articles)} old articles")
 
 
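+# Number of feeds fetched concurrently per batch (override via FEED_FETCH_BATCH_SIZE)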
+BATCH_SIZE = int(os.getenv("FEED_FETCH_BATCH_SIZE", "50"))
+
+
+async def fetch_feed_batch(feed_ids):
+    async def fetch_single_feed(feed):
+        try:
+            return await parse_feed(feed.url)
+        except Exception as e:
+            logger.error(f"Error fetching feed {feed.id}: {e}")
+            return None
+
+    with Session(ENGINE) as session:
+        feeds = session.exec(select(Feed).where(Feed.id.in_(feed_ids))).all()
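+        # Fetch all feeds in the batch concurrently; failed fetches come back as None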
+        tasks = [asyncio.create_task(fetch_single_feed(feed)) for feed in feeds]
+        results = await asyncio.gather(*tasks)
+
+        new_articles = []
+        for feed, parsed_feed in zip(feeds, results):
+            if parsed_feed is None:
+                continue
+
+            for article in parsed_feed.articles:
+                existing_article = session.exec(
+                    select(Article).where(Article.url == article.url)
+                ).first()
+                if not existing_article:
+                    article.feed = feed
+                    session.add(article)
+                    new_articles.append(article)
+
+            feed.updated_at = datetime.now(timezone.utc)
+
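+        # One commit for the whole batch; the new articles get their IDs here, before the GPU job is enqueued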
+        session.commit()
+
+        if new_articles:
+            new_article_ids = [article.id for article in new_articles]
+            enqueue_gpu_task(compute_embeddings_batch, new_article_ids)
+
+        logger.info(
+            f"Fetched {len(feeds)} feeds, added {len(new_articles)} new articles"
+        )
+
+
+def compute_embeddings_batch(article_ids):
+    with Session(ENGINE) as session:
+        articles = session.exec(
+            select(Article).where(Article.id.in_(article_ids))
+        ).all()
+        articles_to_embed = [
+            article for article in articles if article.embedding is None
+        ]
+
+        if not articles_to_embed:
+            return
+
+        try:
+            compute_embeddings(articles_to_embed)
+            session.commit()
+            logger.info(f"Computed embeddings for {len(articles_to_embed)} articles")
+        except Exception as e:
+            logger.error(f"Error computing embeddings for articles: {e}")
+
+
 def fetch_all_feeds():
     with Session(ENGINE) as session:
-        feeds = session.exec(select(Feed)).all()
-        for feed in feeds:
-            low_queue.enqueue(fetch_feed, feed.id)
-        logger.info(f"Enqueued fetch tasks for {len(feeds)} feeds")
+        one_month_ago = datetime.now(timezone.utc) - timedelta(days=30)
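+        # Only refresh feeds linked to users who made a request in the last 30 days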
+        active_feeds = session.exec(
+            select(Feed)
+            .join(User.feeds)
+            .where(User.last_request > one_month_ago)
+            .distinct()
+        ).all()
+
+        for i in range(0, len(active_feeds), BATCH_SIZE):
+            batch = active_feeds[i : i + BATCH_SIZE]
+            asyncio.run(fetch_feed_batch([feed.id for feed in batch]))
+
+        logger.info(
+            f"Processed {len(active_feeds)} active feeds in batches of {BATCH_SIZE}"
+        )
 
 
 def log_user_action(user_id: str, article_id: int, link_url: str):
@@ -123,7 +171,7 @@ def log_user_action(user_id: str, article_id: int, link_url: str):
         if article not in user.articles:
             user.articles.append(article)
             session.add(user)
-            medium_queue.enqueue(recompute_user_clusters, user.id)
+            enqueue_medium_priority(recompute_user_clusters, user.id)
 
         session.commit()
         logger.info(f"Logged action for user {user_id}, article {article_id}")
@@ -151,12 +199,24 @@ def generate_filtered_feed(feed_id: int, user_id: str):
 
 # Helper functions to enqueue tasks
 def enqueue_low_priority(func, *args, **kwargs):
-    return low_queue.enqueue(func, *args, **kwargs)
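+    # rq 1.x expects job_timeout (in seconds) rather than the pre-1.0 "timeout" kwarg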
+    return low_queue.enqueue(
+        func, args=args, kwargs=kwargs, retry=Retry(max=3), job_timeout=300
+    )
 
 
 def enqueue_medium_priority(func, *args, **kwargs):
-    return medium_queue.enqueue(func, *args, **kwargs)
+    return medium_queue.enqueue(
+        func, args=args, kwargs=kwargs, retry=Retry(max=3), job_timeout=300
+    )
 
 
 def enqueue_high_priority(func, *args, **kwargs):
-    return high_queue.enqueue(func, *args, **kwargs)
+    return high_queue.enqueue(
+        func, args=args, kwargs=kwargs, retry=Retry(max=2), job_timeout=10
+    )
+
+
+def enqueue_gpu_task(func, *args, **kwargs):
+    return gpu_queue.enqueue(
+        func, args=args, kwargs=kwargs, retry=Retry(max=3), job_timeout=600
+    )