Skip to content

Commit cb1db53

Browse files
RanKKI and xuweidong authored
feat: organize parsed content (#21)
* word md 分类目录,word 中文文件名
* refactor: fix typo and logic
---------
Co-authored-by: xuweidong <[email protected]>
1 parent 1a26dbe commit cb1db53

File tree

4 files changed

+29
-13
lines changed

4 files changed

+29
-13
lines changed

scripts/manager/cache.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -60,13 +60,20 @@ def OUTPUT_PATH(self):
6060

6161
def write_law(self, path: Path, data: List[str]):
6262
full_path = self.OUTPUT_PATH / path
63-
folder_path = full_path.parents[0]
63+
folder_path = full_path.parent
6464
if not folder_path.exists():
65-
folder_path.mkdir()
65+
folder_path.mkdir(parents=True)
6666
with open(full_path, "w") as f:
6767
result = "\n\n".join(data)
6868
result = result.replace("<!-- TABLE -->\n", "<!-- TABLE -->")
6969
result = result.replace("\n<!-- TABLE END -->", "<!-- TABLE END -->")
7070
result = result.replace("|\n\n|", "|\n|")
7171
result = re.sub("\n{2,}", "\n\n", result)
7272
f.write(result)
73+
74+
75+
def word_output_path(self, key: str, type: CacheType, path: Path | str):
76+
p = self.base_path / type.value / path
77+
if not p.exists():
78+
p.mkdir(parents=True)
79+
return p / key

scripts/manager/request.py

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,16 @@
11
import json
2+
import logging
23
import os
34
import re
45
import urllib.request
56
from hashlib import sha1
7+
from pathlib import Path
68
from time import sleep
7-
import logging
9+
810
import requests
911
from docx import Document
1012
from manager.cache import CacheManager, CacheType
1113

12-
1314
logger = logging.getLogger(__name__)
1415

1516
REQUEST_HEADER = {
@@ -93,11 +94,14 @@ def get_html(self, url) -> str:
9394
self.cache.set(cache_key, CacheType.HTMLDocument, ret, "html")
9495
return ret
9596

96-
def get_word(self, url) -> Document:
97+
def get_word(self, url: str, title_or_output_path: Path) -> Document:
9798
filename = os.path.basename(url)
98-
cache_key = filename.split(".")[0]
99+
_, file_extension = os.path.splitext(filename)
100+
101+
title = title_or_output_path.name
102+
parent_path = title_or_output_path.parent
99103

100-
ok, path = self.cache.is_exists(cache_key, CacheType.WordDocument, "docx")
104+
ok, path = self.cache.is_exists(title, CacheType.WordDocument, file_extension)
101105
if not ok:
102106
if not re.match(".*docx$", filename):
103107
return None

scripts/parsers/word.py

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,17 @@
11
import logging
22
import re
3+
from pathlib import Path
34
from typing import List, Tuple
45

5-
from docx.document import Document as _Document
6+
from common import LINE_RE
67
from docx import Document
8+
from docx.document import Document as _Document
9+
from docx.oxml import CT_SectPr
710
from docx.oxml.table import CT_Tbl
811
from docx.oxml.text.paragraph import CT_P
9-
from docx.oxml import CT_SectPr
1012
from docx.table import Table, _Cell, _Row
1113
from docx.text.paragraph import Paragraph
1214
from parsers.base import Parser
13-
from common import LINE_RE
1415

1516
logger = logging.getLogger(__name__)
1617

@@ -49,12 +50,14 @@ def iter_block_items(self, parent):
4950
yield Table(child, parent)
5051

5152
def parse(self, result, detail) -> Tuple[str, str, List[str]]:
52-
document = self.request.get_word(detail["path"])
53+
level = result["level"].strip()
54+
title = result["title"].strip()
55+
56+
document = self.request.get_word(detail["path"], Path(level) / title)
5357
if not document:
5458
logger.warning(f"document {detail['path']} not exists")
5559
return
5660

57-
title = result["title"].strip()
5861
return self.parse_document(document, title)
5962

6063
def parse_document(self, document, title):

scripts/request.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -78,6 +78,8 @@ def parse_law(self, item):
7878
detail = self.request.get_law_detail(item["id"])
7979
result = detail["result"]
8080
title = result["title"]
81+
# 类别
82+
level = result["level"]
8183
files = self.__reorder_files(result["body"])
8284
logger.debug(f"parsing {title}")
8385
if len(files) == 0:
@@ -96,7 +98,7 @@ def parse_law(self, item):
9698
if not filedata:
9799
continue
98100

99-
output_path = self.__get_law_output_path(title, item["publish"])
101+
output_path = level / self.__get_law_output_path(title, item["publish"])
100102
logger.debug(f"parsing {title} success")
101103
self.cache.write_law(output_path, filedata)
102104

0 commit comments

Comments
 (0)