11import os
2+ import hashlib
3+
def calculate_file_hash(file_path, chunk_size=4096):
    """Compute the SHA-256 hash of a file, reading it in chunks.

    Streaming the file in fixed-size chunks keeps memory use constant
    regardless of file size.

    Args:
        file_path: Path to the file to hash.
        chunk_size: Number of bytes to read per iteration (default 4096;
            kept as the original hard-coded value for backward compatibility).

    Returns:
        The hex-encoded SHA-256 digest as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    hash_sha256 = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # iter() with a b"" sentinel stops cleanly at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_sha256.update(chunk)
    return hash_sha256.hexdigest()
211
def count_files_in_categories(base_dir):
    """Count files in each category subdirectory of *base_dir*.

    Also collects the SHA-256 hash of every file seen, so the caller can
    report a deduplicated total across all categories.

    Args:
        base_dir: Directory whose immediate subdirectories are the categories.

    Returns:
        A tuple ``(category_counts, unique_hashes)`` where *category_counts*
        maps category name -> number of regular files in it, and
        *unique_hashes* is the set of SHA-256 hex digests of all files.
    """
    category_counts = {}
    unique_hashes = set()

    # Walk each entry directly under base_dir; only directories count
    # as categories, everything else is skipped.
    for category in os.listdir(base_dir):
        category_path = os.path.join(base_dir, category)
        if not os.path.isdir(category_path):
            continue

        # Collect the regular files inside this category directory.
        regular_files = [
            os.path.join(category_path, entry)
            for entry in os.listdir(category_path)
            if os.path.isfile(os.path.join(category_path, entry))
        ]

        # Hash every file so duplicates across categories collapse
        # into a single set entry.
        for file_path in regular_files:
            unique_hashes.add(calculate_file_hash(file_path))

        category_counts[category] = len(regular_files)

    return category_counts, unique_hashes
1534
# Output directory to analyze.
output_path = "poc"

# Gather per-category counts and the set of unique file hashes.
category_counts, unique_hashes = count_files_in_categories(output_path)

# Print the per-category statistics.
print("各类文件数量:")
total_files = 0
# NOTE(review): the per-category print loop was collapsed behind a diff hunk
# marker in the scraped source; reconstructed from the surviving
# `total_files += count` accumulation — confirm the exact per-line output
# format against the original file.
for category, count in category_counts.items():
    print(f"{category}: {count}")
    total_files += count

print(f"总文件数量: {total_files}")

# Report the deduplicated count (files with identical SHA-256 collapse to one).
unique_file_count = len(unique_hashes)
print(f"去重后文件数量: {unique_file_count}")