Skip to content

Commit 279f26e

Browse files
zhangpengyun authored and committed
add more articles
1 parent 018bb70 commit 279f26e

File tree

34,120 files changed

+1270060
-2163
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

34,120 files changed

+1270060
-2163
lines changed

babel.config.js

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,8 @@
1+
module.exports = {
2+
presets: ['@babel/preset-react'],
3+
plugins: [
4+
['@babel/plugin-transform-runtime', {
5+
corejs: 3
6+
}]
7+
]
8+
}

data.db

0 Bytes
Binary file not shown.

index.html

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
1+
<!DOCTYPE html>
2+
<html lang="zh-CN">
3+
<head>
4+
<meta charset="UTF-8" />
5+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
6+
<title>微信阅读器</title>
7+
</head>
8+
<body>
9+
<div id="root"></div>
10+
<script type="module" src="/src/main.jsx"></script>
11+
</body>
12+
</html>

internal/service/crawler.go

Lines changed: 171 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,12 @@ import (
44
"bytes"
55
"compress/gzip"
66
"context"
7+
"encoding/json"
78
"fmt"
89
"io"
910
"net/http"
11+
"regexp"
12+
"strconv"
1013
"strings"
1114
"time"
1215

@@ -41,7 +44,8 @@ func (c *Crawler) FetchArticles(ctx context.Context, subscriptionURL string) ([]
4144

4245
// 设置必要的请求头
4346
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF WindowsWechat(0x6309092b) XWEB/9053")
44-
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
47+
// req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
48+
req.Header.Set("Accept", "text/json")
4549
req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
4650
req.Header.Set("Accept-Encoding", "gzip, deflate, br")
4751
req.Header.Set("Connection", "keep-alive")
@@ -116,6 +120,9 @@ func (c *Crawler) FetchArticles(ctx context.Context, subscriptionURL string) ([]
116120
topic = "未分类" // 设置默认主题
117121
}
118122

123+
msgid := ""
124+
itemidx := 0
125+
119126
// 从 DOM 中提取文章列表
120127
var articles []model.Article
121128
if len(articles) == 0 {
@@ -135,6 +142,18 @@ func (c *Crawler) FetchArticles(ctx context.Context, subscriptionURL string) ([]
135142
}
136143
title = strings.TrimSpace(title)
137144

145+
// 获取 msgid、itemidx(如果有的话)
146+
msgidT, exexists := s.Attr("data-msgid")
147+
if !exexists {
148+
msgid = ""
149+
}
150+
msgid = msgidT
151+
itemidxT, exexists := s.Attr("data-itemidx")
152+
if !exexists {
153+
itemidx = 0
154+
}
155+
itemidx, _ = strconv.Atoi(itemidxT)
156+
138157
// 获取发布时间(如果有的话)
139158
publishTime := time.Now() // 默认使用当前时间
140159
publishTimeStr := strings.TrimSpace(s.Find(".album__item-info-time").Text())
@@ -184,12 +203,28 @@ func (c *Crawler) FetchArticles(ctx context.Context, subscriptionURL string) ([]
184203
}
185204
}
186205

206+
// 在获取完初始文章后,尝试获取更多文章
207+
if len(articles) > 0 {
208+
// 从 URL 中提取 topic_id
209+
topicID := ""
210+
if matches := regexp.MustCompile(`album_id=([^&]+)`).FindStringSubmatch(subscriptionURL); len(matches) > 1 {
211+
topicID = matches[1]
212+
213+
// 获取更多文章
214+
moreArticles, err := c.fetchMoreArticles(ctx, topicID, topic, msgid, itemidx)
215+
if err != nil {
216+
fmt.Printf("获取更多文章时出错: %v\n", err)
217+
} else {
218+
articles = append(articles, moreArticles...)
219+
}
220+
}
221+
}
222+
187223
// 打印最终结果
188224
fmt.Printf("总共解析到 %d 篇文章\n", len(articles))
189225
for i, article := range articles {
190226
fmt.Printf("文章 %d: %+v\n", i+1, article)
191227
}
192-
193228
return articles, nil
194229
}
195230

@@ -199,3 +234,137 @@ func min(a, b int) int {
199234
}
200235
return b
201236
}
237+
238+
// WeixinResponse is the JSON envelope decoded from the album ("getalbum")
// endpoint that fetchMoreArticles queries.
//
// BaseResp.Ret is the API status code; fetchMoreArticles treats any non-zero
// value as an error. GetalbumResp.ArticleList holds the articles of one page,
// and GetalbumResp.ContinueFlag is a numeric string: paging stops once it
// parses to 0.
type WeixinResponse struct {
	BaseResp struct {
		Ret int `json:"ret"`
	} `json:"base_resp"`

	GetalbumResp struct {
		ArticleList  []WeixinArticle `json:"article_list"`
		ContinueFlag string          `json:"continue_flag"`
	} `json:"getalbum_resp"`
}

// WeixinArticle is a single entry of WeixinResponse.GetalbumResp.ArticleList.
//
// Every field is decoded as a JSON string. CreateTime is parsed by
// fetchMoreArticles as a base-10 integer of Unix seconds, and Msgid/Itemidx
// together form the cursor used to request the following page.
// CoverImg is read from the "cover_img_1_1" key (presumably a cover image
// URL — confirm against the API).
type WeixinArticle struct {
	Title      string `json:"title"`
	URL        string `json:"url"`
	CoverImg   string `json:"cover_img_1_1"`
	CreateTime string `json:"create_time"`
	Msgid      string `json:"msgid"`
	Itemidx    string `json:"itemidx"`
}
257+
258+
func (c *Crawler) fetchMoreArticles(ctx context.Context, topicID string, topic string, msgid string, itemidex int) ([]model.Article, error) {
259+
var allArticles []model.Article
260+
processedURLs := make(map[string]bool)
261+
262+
nextMsgid := ""
263+
nextItemidx := 0
264+
if msgid != "" {
265+
nextMsgid = msgid
266+
}
267+
if itemidex != 0 {
268+
nextItemidx = itemidex
269+
}
270+
hasMore := true
271+
batchSize := 10
272+
273+
for hasMore {
274+
url := "https://mp.weixin.qq.com/mp/appmsgalbum"
275+
params := map[string]string{
276+
"action": "getalbum",
277+
"album_id": topicID,
278+
"count": fmt.Sprintf("%d", batchSize),
279+
"f": "json",
280+
}
281+
282+
if nextMsgid != "" && nextItemidx > 0 {
283+
params["begin_msgid"] = nextMsgid
284+
params["begin_itemidx"] = fmt.Sprintf("%d", nextItemidx)
285+
}
286+
287+
// 构建请求 URL
288+
reqURL := url + "?"
289+
for k, v := range params {
290+
reqURL += k + "=" + v + "&"
291+
}
292+
reqURL = strings.TrimSuffix(reqURL, "&")
293+
294+
// 创建请求
295+
req, err := http.NewRequestWithContext(ctx, "GET", reqURL, nil)
296+
if err != nil {
297+
return nil, fmt.Errorf("创建请求失败: %v", err)
298+
}
299+
300+
req.Header.Set("Accept", "application/json")
301+
req.Header.Set("Referer", fmt.Sprintf("https://mp.weixin.qq.com/mp/appmsgalbum?action=getalbum&album_id=%s", topicID))
302+
303+
// 发送请求
304+
resp, err := c.client.Do(req)
305+
if err != nil {
306+
return nil, fmt.Errorf("请求失败: %v", err)
307+
}
308+
defer resp.Body.Close()
309+
310+
// 解析响应
311+
var result WeixinResponse
312+
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
313+
return nil, fmt.Errorf("解析响应失败: %v", err)
314+
}
315+
316+
if result.BaseResp.Ret != 0 {
317+
return nil, fmt.Errorf("API返回错误码: %d", result.BaseResp.Ret)
318+
}
319+
320+
newArticlesCount := 0
321+
for _, wxArticle := range result.GetalbumResp.ArticleList {
322+
if !processedURLs[wxArticle.URL] {
323+
// 将字符串类型的创建时间转换为int64
324+
createTimeInt, err := strconv.ParseInt(wxArticle.CreateTime, 10, 64)
325+
if err != nil {
326+
// 如果转换失败,使用当前时间作为发布时间
327+
createTimeInt = time.Now().Unix()
328+
}
329+
330+
article := model.Article{
331+
ID: fmt.Sprintf("article_%d_%d", time.Now().Unix(), len(allArticles)),
332+
Title: wxArticle.Title,
333+
URL: wxArticle.URL,
334+
Topic: topic,
335+
PublishTime: time.Unix(createTimeInt, 0),
336+
CreateTime: time.Now(),
337+
}
338+
allArticles = append(allArticles, article)
339+
processedURLs[wxArticle.URL] = true
340+
newArticlesCount++
341+
}
342+
}
343+
344+
if newArticlesCount == 0 || len(result.GetalbumResp.ArticleList) == 0 {
345+
break
346+
}
347+
348+
// 更新下一次请求的参数
349+
lastArticle := result.GetalbumResp.ArticleList[len(result.GetalbumResp.ArticleList)-1]
350+
nextMsgid = lastArticle.Msgid
351+
itemidx, err := strconv.Atoi(lastArticle.Itemidx)
352+
if err != nil {
353+
nextItemidx = 0
354+
} else {
355+
nextItemidx = itemidx
356+
}
357+
358+
cf, err := strconv.ParseInt(result.GetalbumResp.ContinueFlag, 10, 64)
359+
if err != nil {
360+
}
361+
if cf == 0 {
362+
hasMore = false
363+
}
364+
365+
// 添加延时避免被封
366+
time.Sleep(2 * time.Second)
367+
}
368+
369+
return allArticles, nil
370+
}

internal/storage/database.go

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -19,18 +19,19 @@ func NewDatabase(ctx context.Context, dbPath string) (*Database, error) {
1919
return nil, err
2020
}
2121

22-
// 修改这里,不要每次都删除表
22+
// 创建表并添加 URL 唯一索引
2323
_, err = db.ExecContext(ctx, `
2424
CREATE TABLE IF NOT EXISTS articles (
2525
id TEXT PRIMARY KEY,
2626
title TEXT NOT NULL,
2727
author TEXT,
2828
content TEXT,
29-
url TEXT,
29+
url TEXT UNIQUE,
3030
topic TEXT,
3131
publish_time DATETIME,
3232
create_time DATETIME
33-
)
33+
);
34+
CREATE UNIQUE INDEX IF NOT EXISTS idx_articles_url ON articles(url);
3435
`)
3536
if err != nil {
3637
return nil, err
@@ -52,14 +53,19 @@ func (d *Database) SaveArticles(ctx context.Context, articles []model.Article) e
5253

5354
stmt, err := tx.PrepareContext(ctx, `
5455
INSERT OR REPLACE INTO articles (id, title, author, content, url, topic, publish_time, create_time)
55-
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
56+
VALUES (?, ?, ?, ?, NULLIF(?, ''), ?, ?, ?)
5657
`)
5758
if err != nil {
5859
return err
5960
}
6061
defer stmt.Close()
6162

6263
for _, article := range articles {
64+
// 跳过没有 URL 的文章
65+
if article.URL == "" {
66+
continue
67+
}
68+
6369
_, err = stmt.ExecContext(ctx,
6470
article.ID,
6571
article.Title,

0 commit comments

Comments
 (0)