Commit 4b3c012

add crawling

1 parent 9433209 commit 4b3c012
File tree (5 files changed: +171 −0 lines changed):

pythonkr_backend/curation/migrations/0008_rssitem_crawled_content_rssitem_crawling_status.py
pythonkr_backend/curation/migrations/0009_rssitem_crawled_at_rssitem_error_message.py
pythonkr_backend/curation/models.py
pythonkr_backend/curation/tasks.py
pythonkr_backend/pythonkr_backend/settings/prod.py

pythonkr_backend/curation/migrations/0008_rssitem_crawled_content_rssitem_crawling_status.py

Lines changed: 24 additions & 0 deletions

```diff
@@ -0,0 +1,24 @@
+# Generated by Django 5.2.1 on 2025-06-13 02:12
+
+import curation.models
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('curation', '0007_crawlingsources_rssfeed_crawlingsite_crawlurl_and_more'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='rssitem',
+            name='crawled_content',
+            field=models.FileField(blank=True, help_text='Crawled markdown content file', null=True, upload_to=curation.models.rss_item_upload_path),
+        ),
+        migrations.AddField(
+            model_name='rssitem',
+            name='crawling_status',
+            field=models.CharField(choices=[('pending', 'Crawl pending'), ('completed', 'Crawl completed'), ('failed', 'Crawl failed')], default='pending', help_text='Crawling status', max_length=20),
+        ),
+    ]
```
pythonkr_backend/curation/migrations/0009_rssitem_crawled_at_rssitem_error_message.py

Lines changed: 23 additions & 0 deletions

```diff
@@ -0,0 +1,23 @@
+# Generated by Django 5.2.1 on 2025-06-13 02:16
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('curation', '0008_rssitem_crawled_content_rssitem_crawling_status'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='rssitem',
+            name='crawled_at',
+            field=models.DateTimeField(blank=True, help_text='Time the crawl completed', null=True),
+        ),
+        migrations.AddField(
+            model_name='rssitem',
+            name='error_message',
+            field=models.TextField(blank=True, help_text='Error message when crawling fails'),
+        ),
+    ]
```
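
A note on the generated code (standard Django behavior, not stated in the commit): migration 0008 serializes a direct reference to curation.models.rss_item_upload_path, so that callable must remain importable at exactly that path for the migration to load. A minimal sanity check, assuming Django settings are configured:

```python
# Illustrative check: the upload_to callable referenced by migration 0008
# must be importable from curation.models for the migration to deserialize.
from importlib import import_module

assert callable(import_module("curation.models").rss_item_upload_path)
```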

pythonkr_backend/curation/models.py

Lines changed: 34 additions & 0 deletions
```diff
@@ -6,6 +6,13 @@
 from datetime import timedelta
 
 
+def rss_item_upload_path(instance, filename):
+    """Generate upload path for RSS item crawled content"""
+    from datetime import datetime
+    now = datetime.now()
+    return f"rssitem-crawling/{now.year}/{now.month:02d}/{instance.id}-crawl.md"
+
+
 class Category(models.Model):
     name = models.CharField(max_length=100, unique=True, help_text="The name of the category (e.g., 'Web Development', 'LLM').")
     slug = models.SlugField(max_length=100, unique=True, help_text="A URL-friendly slug for the category.", blank=True)  # Optional but good practice
```
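
As a quick illustration of the new upload_to callable (not part of the commit): the path is built from the current date and the instance's primary key, so it assumes the row has already been saved (instance.id would otherwise be None). A hedged sketch, with SimpleNamespace standing in for an RSSItem and id=42 purely hypothetical:

```python
from types import SimpleNamespace

from curation.models import rss_item_upload_path  # requires the Django app on the path

item = SimpleNamespace(id=42)  # hypothetical stand-in for a saved RSSItem
print(rss_item_upload_path(item, "original-name.md"))
# e.g. "rssitem-crawling/2025/06/42-crawl.md" -- year/month come from
# datetime.now(), and the incoming filename is ignored entirely.
```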
```diff
@@ -286,6 +293,12 @@ class Meta:
 
 
 class RSSItem(models.Model):
+    CRAWLING_STATUS_CHOICES = [
+        ('pending', 'Crawl pending'),
+        ('completed', 'Crawl completed'),
+        ('failed', 'Crawl failed'),
+    ]
+
     feed = models.ForeignKey(
         RSSFeed,
         on_delete=models.CASCADE,
@@ -299,6 +312,27 @@ class RSSItem(models.Model):
     category = models.CharField(max_length=200, blank=True, help_text="Category")
     guid = models.CharField(max_length=500, blank=True, unique=True, help_text="GUID")
     pub_date = models.DateTimeField(null=True, blank=True, help_text="Publication date")
+    crawling_status = models.CharField(
+        max_length=20,
+        choices=CRAWLING_STATUS_CHOICES,
+        default='pending',
+        help_text="Crawling status"
+    )
+    crawled_content = models.FileField(
+        upload_to=rss_item_upload_path,
+        blank=True,
+        null=True,
+        help_text="Crawled markdown content file"
+    )
+    crawled_at = models.DateTimeField(
+        null=True,
+        blank=True,
+        help_text="Time the crawl completed"
+    )
+    error_message = models.TextField(
+        blank=True,
+        help_text="Error message when crawling fails"
+    )
     created_at = models.DateTimeField(auto_now_add=True)
 
     def __str__(self):
```
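
A short usage sketch of the new fields (illustrative, assuming at least one row exists): get_crawling_status_display() is generated by Django from the choices list, and crawled_content is falsy until a file has been attached.

```python
from curation.models import RSSItem

item = RSSItem.objects.filter(crawling_status='pending').order_by('-pub_date').first()
if item is not None:
    print(item.get_crawling_status_display())   # "Crawl pending"
    print(bool(item.crawled_content))           # False until a crawl saves a file
    print(item.crawled_at, item.error_message)  # None and "" before any attempt
```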

pythonkr_backend/curation/tasks.py

Lines changed: 85 additions & 0 deletions
```diff
@@ -7,6 +7,7 @@
 import requests
 from datetime import datetime, timezone
 from django.utils import timezone as django_timezone
+from django.core.files.base import ContentFile
 from .models import RSSFeed, RSSItem
 import logging
 
@@ -150,3 +151,87 @@ def crawl_rss():
     logfire.info("start to crawl rss")
     logger.info("start to crawl rss")
     return crawl_all_rss_feeds()
+
+
+@shared_task
+def crawl_rss_item_content():
+    """Task that crawls the body content of RSS items (runs every 10 minutes)."""
+    logfire.info("Starting RSS item content crawling")
+    logger.info("Starting RSS item content crawling")
+
+    # Fetch the single most recent item that has not been crawled yet
+    pending_item = RSSItem.objects.filter(
+        crawling_status='pending'
+    ).order_by('-pub_date', '-created_at').first()
+
+    if not pending_item:
+        logfire.info("No pending RSS items to crawl")
+        logger.info("No pending RSS items to crawl")
+        return {"status": "no_items", "message": "No pending items to crawl"}
+
+    logfire.info(f"Crawling RSS item: {pending_item.title} ({pending_item.link})")
+    logger.info(f"Crawling RSS item: {pending_item.title} ({pending_item.link})")
+
+    # Flip the crawling status before fetching (guards against concurrent processing)
+    pending_item.crawling_status = 'completed'  # set temporarily to prevent duplicate processing
+    pending_item.save(update_fields=['crawling_status'])
+
+    try:
+        # Crawl the content using Jina AI
+        jina_url = f"https://r.jina.ai/{pending_item.link}"
+
+        response = requests.get(jina_url, timeout=30)
+        response.raise_for_status()
+
+        # Capture the markdown content
+        markdown_content = response.text
+
+        # Save it to a file
+        filename = f"{pending_item.id}-crawl.md"
+        content_file = ContentFile(markdown_content.encode('utf-8'))
+
+        pending_item.crawled_content.save(filename, content_file, save=False)
+        pending_item.crawling_status = 'completed'
+        pending_item.crawled_at = django_timezone.now()
+        pending_item.error_message = ''  # clear the error message on success
+        pending_item.save()
+
+        logfire.info(f"Successfully crawled RSS item: {pending_item.title}")
+        logger.info(f"Successfully crawled RSS item: {pending_item.title}")
+
+        return {
+            "status": "success",
+            "item_id": pending_item.id,
+            "item_title": pending_item.title,
+            "content_length": len(markdown_content)
+        }
+
+    except requests.RequestException as e:
+        error_msg = f"Network error while crawling {pending_item.link}: {str(e)}"
+        pending_item.crawling_status = 'failed'
+        pending_item.error_message = error_msg
+        pending_item.save(update_fields=['crawling_status', 'error_message'])
+
+        logfire.error(error_msg)
+        logger.error(error_msg)
+
+        return {
+            "status": "failed",
+            "item_id": pending_item.id,
+            "error": error_msg
+        }
+
+    except Exception as e:
+        error_msg = f"Unexpected error while crawling {pending_item.link}: {str(e)}"
+        pending_item.crawling_status = 'failed'
+        pending_item.error_message = error_msg
+        pending_item.save(update_fields=['crawling_status', 'error_message'])
+
+        logfire.error(error_msg)
+        logger.error(error_msg)
+
+        return {
+            "status": "failed",
+            "item_id": pending_item.id,
+            "error": error_msg
+        }
```
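
Two things worth noting about the task above. First, the pre-fetch save marks the row 'completed' before the HTTP call because the model defines no 'in progress' status; this keeps the next scheduled run from picking the same item, though it is a plain save rather than an atomic lock. Second, the crawling itself is delegated to the Jina AI Reader: prefixing a URL with https://r.jina.ai/ returns that page rendered as markdown. A standalone sketch of just that call (the target URL is illustrative):

```python
import requests

article_url = "https://example.com/some-post"  # hypothetical target
response = requests.get(f"https://r.jina.ai/{article_url}", timeout=30)
response.raise_for_status()

markdown = response.text  # the page body converted to markdown
print(markdown[:200])
```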

pythonkr_backend/pythonkr_backend/settings/prod.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -97,4 +97,9 @@
         'schedule': crontab(minute='*/10'),  # Every 10 minutes
         'options': {'queue': 'celery'}
     },
+    'crawl-rss-item-content': {
+        'task': 'curation.tasks.crawl_rss_item_content',
+        'schedule': crontab(minute='*/10'),  # Every 10 minutes
+        'options': {'queue': 'celery'}
+    },
 }
```
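
For reference, crontab(minute='*/10') fires at every wall-clock minute divisible by 10, and both beat entries route to the same 'celery' queue, so a single worker consuming that queue handles both feed polling and item crawling. A minimal sketch of the schedule object, assuming the standard celery.schedules API:

```python
from celery.schedules import crontab

# Equivalent to the schedule used above: runs at :00, :10, :20, :30, :40, :50.
every_ten_minutes = crontab(minute='*/10')
```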
