@@ -4,9 +4,12 @@ import (
4
4
"bytes"
5
5
"compress/gzip"
6
6
"context"
7
+ "encoding/json"
7
8
"fmt"
8
9
"io"
9
10
"net/http"
11
+ "regexp"
12
+ "strconv"
10
13
"strings"
11
14
"time"
12
15
@@ -41,7 +44,8 @@ func (c *Crawler) FetchArticles(ctx context.Context, subscriptionURL string) ([]
41
44
42
45
// 设置必要的请求头
43
46
req .Header .Set ("User-Agent" , "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309092b) XWEB/9053" )
44
- req .Header .Set ("Accept" , "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" )
47
+ // req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
48
+ req .Header .Set ("Accept" , "text/json" )
45
49
req .Header .Set ("Accept-Language" , "zh-CN,zh;q=0.9,en;q=0.8" )
46
50
req .Header .Set ("Accept-Encoding" , "gzip, deflate, br" )
47
51
req .Header .Set ("Connection" , "keep-alive" )
@@ -116,6 +120,9 @@ func (c *Crawler) FetchArticles(ctx context.Context, subscriptionURL string) ([]
116
120
topic = "未分类" // 设置默认主题
117
121
}
118
122
123
+ msgid := ""
124
+ itemidx := 0
125
+
119
126
// 从 DOM 中提取文章列表
120
127
var articles []model.Article
121
128
if len (articles ) == 0 {
@@ -135,6 +142,18 @@ func (c *Crawler) FetchArticles(ctx context.Context, subscriptionURL string) ([]
135
142
}
136
143
title = strings .TrimSpace (title )
137
144
145
+ // 获取 msgid、itemidx(如果有的话)
146
+ msgidT , exexists := s .Attr ("data-msgid" )
147
+ if ! exexists {
148
+ msgid = ""
149
+ }
150
+ msgid = msgidT
151
+ itemidxT , exexists := s .Attr ("data-itemidx" )
152
+ if ! exexists {
153
+ itemidx = 0
154
+ }
155
+ itemidx , _ = strconv .Atoi (itemidxT )
156
+
138
157
// 获取发布时间(如果有的话)
139
158
publishTime := time .Now () // 默认使用当前时间
140
159
publishTimeStr := strings .TrimSpace (s .Find (".album__item-info-time" ).Text ())
@@ -184,12 +203,28 @@ func (c *Crawler) FetchArticles(ctx context.Context, subscriptionURL string) ([]
184
203
}
185
204
}
186
205
206
+ // 在获取完初始文章后,尝试获取更多文章
207
+ if len (articles ) > 0 {
208
+ // 从 URL 中提取 topic_id
209
+ topicID := ""
210
+ if matches := regexp .MustCompile (`album_id=([^&]+)` ).FindStringSubmatch (subscriptionURL ); len (matches ) > 1 {
211
+ topicID = matches [1 ]
212
+
213
+ // 获取更多文章
214
+ moreArticles , err := c .fetchMoreArticles (ctx , topicID , topic , msgid , itemidx )
215
+ if err != nil {
216
+ fmt .Printf ("获取更多文章时出错: %v\n " , err )
217
+ } else {
218
+ articles = append (articles , moreArticles ... )
219
+ }
220
+ }
221
+ }
222
+
187
223
// 打印最终结果
188
224
fmt .Printf ("总共解析到 %d 篇文章\n " , len (articles ))
189
225
for i , article := range articles {
190
226
fmt .Printf ("文章 %d: %+v\n " , i + 1 , article )
191
227
}
192
-
193
228
return articles , nil
194
229
}
195
230
@@ -199,3 +234,137 @@ func min(a, b int) int {
199
234
}
200
235
return b
201
236
}
237
+
238
// JSON payload types decoded from the WeChat album ("getalbum") endpoint.
type (
	// WeixinResponse is the top-level JSON object returned by the album
	// endpoint and decoded by fetchMoreArticles.
	WeixinResponse struct {
		// BaseResp carries the API status; fetchMoreArticles treats any
		// non-zero Ret as an error.
		BaseResp struct {
			Ret int `json:"ret"`
		} `json:"base_resp"`

		// GetalbumResp holds one page of album articles.
		GetalbumResp struct {
			// ArticleList is the page of articles, in the order returned.
			ArticleList []WeixinArticle `json:"article_list"`
			// ContinueFlag is a decimal string; fetchMoreArticles stops
			// paging when it parses to 0.
			ContinueFlag string `json:"continue_flag"`
		} `json:"getalbum_resp"`
	}

	// WeixinArticle is a single entry of WeixinResponse's article list.
	// Every field is decoded as a JSON string.
	WeixinArticle struct {
		Title string `json:"title"`
		URL   string `json:"url"`
		// CoverImg is read from the "cover_img_1_1" key.
		CoverImg string `json:"cover_img_1_1"`
		// CreateTime is parsed by fetchMoreArticles as base-10 Unix seconds.
		CreateTime string `json:"create_time"`
		// Msgid and Itemidx of the last article on a page are used by
		// fetchMoreArticles as the cursor for the next page.
		Msgid   string `json:"msgid"`
		Itemidx string `json:"itemidx"`
	}
)
257
+
258
+ func (c * Crawler ) fetchMoreArticles (ctx context.Context , topicID string , topic string , msgid string , itemidex int ) ([]model.Article , error ) {
259
+ var allArticles []model.Article
260
+ processedURLs := make (map [string ]bool )
261
+
262
+ nextMsgid := ""
263
+ nextItemidx := 0
264
+ if msgid != "" {
265
+ nextMsgid = msgid
266
+ }
267
+ if itemidex != 0 {
268
+ nextItemidx = itemidex
269
+ }
270
+ hasMore := true
271
+ batchSize := 10
272
+
273
+ for hasMore {
274
+ url := "https://mp.weixin.qq.com/mp/appmsgalbum"
275
+ params := map [string ]string {
276
+ "action" : "getalbum" ,
277
+ "album_id" : topicID ,
278
+ "count" : fmt .Sprintf ("%d" , batchSize ),
279
+ "f" : "json" ,
280
+ }
281
+
282
+ if nextMsgid != "" && nextItemidx > 0 {
283
+ params ["begin_msgid" ] = nextMsgid
284
+ params ["begin_itemidx" ] = fmt .Sprintf ("%d" , nextItemidx )
285
+ }
286
+
287
+ // 构建请求 URL
288
+ reqURL := url + "?"
289
+ for k , v := range params {
290
+ reqURL += k + "=" + v + "&"
291
+ }
292
+ reqURL = strings .TrimSuffix (reqURL , "&" )
293
+
294
+ // 创建请求
295
+ req , err := http .NewRequestWithContext (ctx , "GET" , reqURL , nil )
296
+ if err != nil {
297
+ return nil , fmt .Errorf ("创建请求失败: %v" , err )
298
+ }
299
+
300
+ req .Header .Set ("Accept" , "application/json" )
301
+ req .Header .Set ("Referer" , fmt .Sprintf ("https://mp.weixin.qq.com/mp/appmsgalbum?action=getalbum&album_id=%s" , topicID ))
302
+
303
+ // 发送请求
304
+ resp , err := c .client .Do (req )
305
+ if err != nil {
306
+ return nil , fmt .Errorf ("请求失败: %v" , err )
307
+ }
308
+ defer resp .Body .Close ()
309
+
310
+ // 解析响应
311
+ var result WeixinResponse
312
+ if err := json .NewDecoder (resp .Body ).Decode (& result ); err != nil {
313
+ return nil , fmt .Errorf ("解析响应失败: %v" , err )
314
+ }
315
+
316
+ if result .BaseResp .Ret != 0 {
317
+ return nil , fmt .Errorf ("API返回错误码: %d" , result .BaseResp .Ret )
318
+ }
319
+
320
+ newArticlesCount := 0
321
+ for _ , wxArticle := range result .GetalbumResp .ArticleList {
322
+ if ! processedURLs [wxArticle .URL ] {
323
+ // 将字符串类型的创建时间转换为int64
324
+ createTimeInt , err := strconv .ParseInt (wxArticle .CreateTime , 10 , 64 )
325
+ if err != nil {
326
+ // 如果转换失败,使用当前时间作为发布时间
327
+ createTimeInt = time .Now ().Unix ()
328
+ }
329
+
330
+ article := model.Article {
331
+ ID : fmt .Sprintf ("article_%d_%d" , time .Now ().Unix (), len (allArticles )),
332
+ Title : wxArticle .Title ,
333
+ URL : wxArticle .URL ,
334
+ Topic : topic ,
335
+ PublishTime : time .Unix (createTimeInt , 0 ),
336
+ CreateTime : time .Now (),
337
+ }
338
+ allArticles = append (allArticles , article )
339
+ processedURLs [wxArticle .URL ] = true
340
+ newArticlesCount ++
341
+ }
342
+ }
343
+
344
+ if newArticlesCount == 0 || len (result .GetalbumResp .ArticleList ) == 0 {
345
+ break
346
+ }
347
+
348
+ // 更新下一次请求的参数
349
+ lastArticle := result .GetalbumResp .ArticleList [len (result .GetalbumResp .ArticleList )- 1 ]
350
+ nextMsgid = lastArticle .Msgid
351
+ itemidx , err := strconv .Atoi (lastArticle .Itemidx )
352
+ if err != nil {
353
+ nextItemidx = 0
354
+ } else {
355
+ nextItemidx = itemidx
356
+ }
357
+
358
+ cf , err := strconv .ParseInt (result .GetalbumResp .ContinueFlag , 10 , 64 )
359
+ if err != nil {
360
+ }
361
+ if cf == 0 {
362
+ hasMore = false
363
+ }
364
+
365
+ // 添加延时避免被封
366
+ time .Sleep (2 * time .Second )
367
+ }
368
+
369
+ return allArticles , nil
370
+ }
0 commit comments