Commit 4b3c012

add crawling

1 parent 9433209 commit 4b3c012
File tree (5 files changed: +171 −0 lines changed):

pythonkr_backend/curation/migrations/0008_rssitem_crawled_content_rssitem_crawling_status.py
pythonkr_backend/curation/migrations/0009_rssitem_crawled_at_rssitem_error_message.py
pythonkr_backend/curation/models.py
pythonkr_backend/curation/tasks.py
pythonkr_backend/pythonkr_backend/settings/prod.py

pythonkr_backend/curation/migrations/0008_rssitem_crawled_content_rssitem_crawling_status.py

Lines changed: 24 additions & 0 deletions

```diff
@@ -0,0 +1,24 @@
+# Generated by Django 5.2.1 on 2025-06-13 02:12
+
+import curation.models
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('curation', '0007_crawlingsources_rssfeed_crawlingsite_crawlurl_and_more'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='rssitem',
+            name='crawled_content',
+            field=models.FileField(blank=True, help_text='Crawled markdown content file', null=True, upload_to=curation.models.rss_item_upload_path),
+        ),
+        migrations.AddField(
+            model_name='rssitem',
+            name='crawling_status',
+            field=models.CharField(choices=[('pending', 'Crawl pending'), ('completed', 'Crawl completed'), ('failed', 'Crawl failed')], default='pending', help_text='Crawling status', max_length=20),
+        ),
+    ]
```
pythonkr_backend/curation/migrations/0009_rssitem_crawled_at_rssitem_error_message.py

Lines changed: 23 additions & 0 deletions

```diff
@@ -0,0 +1,23 @@
+# Generated by Django 5.2.1 on 2025-06-13 02:16
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('curation', '0008_rssitem_crawled_content_rssitem_crawling_status'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='rssitem',
+            name='crawled_at',
+            field=models.DateTimeField(blank=True, help_text='Time the crawl completed', null=True),
+        ),
+        migrations.AddField(
+            model_name='rssitem',
+            name='error_message',
+            field=models.TextField(blank=True, help_text='Error message when crawling fails'),
+        ),
+    ]
```
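
A note on the generated code (standard Django behavior, not stated in the commit): migration 0008 serializes a direct reference to curation.models.rss_item_upload_path, so that callable must remain importable at exactly that path for the migration to load. A minimal sanity check, assuming Django settings are configured:

```python
# Illustrative check: the upload_to callable referenced by migration 0008
# must be importable from curation.models for the migration to deserialize.
from importlib import import_module

assert callable(import_module("curation.models").rss_item_upload_path)
```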

pythonkr_backend/curation/models.py

Lines changed: 34 additions & 0 deletions
```diff
@@ -6,6 +6,13 @@
 from datetime import timedelta
 
 
+def rss_item_upload_path(instance, filename):
+    """Generate upload path for RSS item crawled content"""
+    from datetime import datetime
+    now = datetime.now()
+    return f"rssitem-crawling/{now.year}/{now.month:02d}/{instance.id}-crawl.md"
+
+
 class Category(models.Model):
     name = models.CharField(max_length=100, unique=True, help_text="The name of the category (e.g., 'Web Development', 'LLM').")
     slug = models.SlugField(max_length=100, unique=True, help_text="A URL-friendly slug for the category.", blank=True)  # Optional but good practice
```
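
As a quick illustration of the new upload_to callable (not part of the commit): the path is built from the current date and the instance's primary key, so it assumes the row has already been saved (instance.id would otherwise be None). A hedged sketch, with SimpleNamespace standing in for an RSSItem and id=42 purely hypothetical:

```python
from types import SimpleNamespace

from curation.models import rss_item_upload_path  # requires the Django app on the path

item = SimpleNamespace(id=42)  # hypothetical stand-in for a saved RSSItem
print(rss_item_upload_path(item, "original-name.md"))
# e.g. "rssitem-crawling/2025/06/42-crawl.md" -- year/month come from
# datetime.now(), and the incoming filename is ignored entirely.
```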
```diff
@@ -286,6 +293,12 @@ class Meta:
 
 
 class RSSItem(models.Model):
+    CRAWLING_STATUS_CHOICES = [
+        ('pending', 'Crawl pending'),
+        ('completed', 'Crawl completed'),
+        ('failed', 'Crawl failed'),
+    ]
+
     feed = models.ForeignKey(
         RSSFeed,
         on_delete=models.CASCADE,
@@ -299,6 +312,27 @@ class RSSItem(models.Model):
     category = models.CharField(max_length=200, blank=True, help_text="Category")
     guid = models.CharField(max_length=500, blank=True, unique=True, help_text="GUID")
     pub_date = models.DateTimeField(null=True, blank=True, help_text="Publication date")
+    crawling_status = models.CharField(
+        max_length=20,
+        choices=CRAWLING_STATUS_CHOICES,
+        default='pending',
+        help_text="Crawling status"
+    )
+    crawled_content = models.FileField(
+        upload_to=rss_item_upload_path,
+        blank=True,
+        null=True,
+        help_text="Crawled markdown content file"
+    )
+    crawled_at = models.DateTimeField(
+        null=True,
+        blank=True,
+        help_text="Time the crawl completed"
+    )
+    error_message = models.TextField(
+        blank=True,
+        help_text="Error message when crawling fails"
+    )
     created_at = models.DateTimeField(auto_now_add=True)
 
     def __str__(self):
```
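
A short usage sketch of the new fields (illustrative, assuming at least one row exists): get_crawling_status_display() is generated by Django from the choices list, and crawled_content is falsy until a file has been attached.

```python
from curation.models import RSSItem

item = RSSItem.objects.filter(crawling_status='pending').order_by('-pub_date').first()
if item is not None:
    print(item.get_crawling_status_display())   # "Crawl pending"
    print(bool(item.crawled_content))           # False until a crawl saves a file
    print(item.crawled_at, item.error_message)  # None and "" before any attempt
```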

pythonkr_backend/curation/tasks.py

Lines changed: 85 additions & 0 deletions
```diff
@@ -7,6 +7,7 @@
 import requests
 from datetime import datetime, timezone
 from django.utils import timezone as django_timezone
+from django.core.files.base import ContentFile
 from .models import RSSFeed, RSSItem
 import logging
 
@@ -150,3 +151,87 @@ def crawl_rss():
     logfire.info("start to crawl rss")
     logger.info("start to crawl rss")
     return crawl_all_rss_feeds()
+
+
+@shared_task
+def crawl_rss_item_content():
+    """Task that crawls the body content of RSS items (runs every 10 minutes)."""
+    logfire.info("Starting RSS item content crawling")
+    logger.info("Starting RSS item content crawling")
+
+    # Fetch the single most recent item that has not been crawled yet
+    pending_item = RSSItem.objects.filter(
+        crawling_status='pending'
+    ).order_by('-pub_date', '-created_at').first()
+
+    if not pending_item:
+        logfire.info("No pending RSS items to crawl")
+        logger.info("No pending RSS items to crawl")
+        return {"status": "no_items", "message": "No pending items to crawl"}
+
+    logfire.info(f"Crawling RSS item: {pending_item.title} ({pending_item.link})")
+    logger.info(f"Crawling RSS item: {pending_item.title} ({pending_item.link})")
+
+    # Flip the crawling status before fetching (guards against concurrent processing)
+    pending_item.crawling_status = 'completed'  # set temporarily to prevent duplicate processing
+    pending_item.save(update_fields=['crawling_status'])
+
+    try:
+        # Crawl the content using Jina AI
+        jina_url = f"https://r.jina.ai/{pending_item.link}"
+
+        response = requests.get(jina_url, timeout=30)
+        response.raise_for_status()
+
+        # Capture the markdown content
+        markdown_content = response.text
+
+        # Save it to a file
+        filename = f"{pending_item.id}-crawl.md"
+        content_file = ContentFile(markdown_content.encode('utf-8'))
+
+        pending_item.crawled_content.save(filename, content_file, save=False)
+        pending_item.crawling_status = 'completed'
+        pending_item.crawled_at = django_timezone.now()
+        pending_item.error_message = ''  # clear the error message on success
+        pending_item.save()
+
+        logfire.info(f"Successfully crawled RSS item: {pending_item.title}")
+        logger.info(f"Successfully crawled RSS item: {pending_item.title}")
+
+        return {
+            "status": "success",
+            "item_id": pending_item.id,
+            "item_title": pending_item.title,
+            "content_length": len(markdown_content)
+        }
+
+    except requests.RequestException as e:
+        error_msg = f"Network error while crawling {pending_item.link}: {str(e)}"
+        pending_item.crawling_status = 'failed'
+        pending_item.error_message = error_msg
+        pending_item.save(update_fields=['crawling_status', 'error_message'])
+
+        logfire.error(error_msg)
+        logger.error(error_msg)
+
+        return {
+            "status": "failed",
+            "item_id": pending_item.id,
+            "error": error_msg
+        }
+
+    except Exception as e:
+        error_msg = f"Unexpected error while crawling {pending_item.link}: {str(e)}"
+        pending_item.crawling_status = 'failed'
+        pending_item.error_message = error_msg
+        pending_item.save(update_fields=['crawling_status', 'error_message'])
+
+        logfire.error(error_msg)
+        logger.error(error_msg)
+
+        return {
+            "status": "failed",
+            "item_id": pending_item.id,
+            "error": error_msg
+        }
```
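
Two things worth noting about the task above. First, the pre-fetch save marks the row 'completed' before the HTTP call because the model defines no 'in progress' status; this keeps the next scheduled run from picking the same item, though it is a plain save rather than an atomic lock. Second, the crawling itself is delegated to the Jina AI Reader: prefixing a URL with https://r.jina.ai/ returns that page rendered as markdown. A standalone sketch of just that call (the target URL is illustrative):

```python
import requests

article_url = "https://example.com/some-post"  # hypothetical target
response = requests.get(f"https://r.jina.ai/{article_url}", timeout=30)
response.raise_for_status()

markdown = response.text  # the page body converted to markdown
print(markdown[:200])
```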

pythonkr_backend/pythonkr_backend/settings/prod.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -97,4 +97,9 @@
         'schedule': crontab(minute='*/10'),  # Every 10 minutes
         'options': {'queue': 'celery'}
     },
+    'crawl-rss-item-content': {
+        'task': 'curation.tasks.crawl_rss_item_content',
+        'schedule': crontab(minute='*/10'),  # Every 10 minutes
+        'options': {'queue': 'celery'}
+    },
 }
```
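
For reference, crontab(minute='*/10') fires at every wall-clock minute divisible by 10, and both beat entries route to the same 'celery' queue, so a single worker consuming that queue handles both feed polling and item crawling. A minimal sketch of the schedule object, assuming the standard celery.schedules API:

```python
from celery.schedules import crontab

# Equivalent to the schedule used above: runs at :00, :10, :20, :30, :40, :50.
every_ten_minutes = crontab(minute='*/10')
```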
