Skip to content

Commit 94e9ebd

Browse files
committed
feat: add scrapegraphai integration
1 parent 2a73821 commit 94e9ebd

File tree

9 files changed

+297
-30
lines changed

9 files changed

+297
-30
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
SCRAPEGRAPH_API_KEY=your-api-key-here

examples/search_graph/scrapegraphai/readme.md

Whitespace-only changes.
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
"""
2+
Example implementation of search-based scraping using Scrapegraph AI.
3+
This example demonstrates how to use the searchscraper to extract information from the web.
4+
"""
5+
6+
import os
7+
from typing import Dict, Any
8+
from dotenv import load_dotenv
9+
from scrapegraph_py import Client
10+
from scrapegraph_py.logger import sgai_logger
11+
12+
def format_response(response: Dict[str, Any]) -> None:
    """
    Format and print the search response in a readable way.

    Args:
        response (Dict[str, Any]): The response from the search API.
            Expected keys: ``request_id``, ``result`` and, optionally,
            ``reference_urls``.
    """
    print("\n" + "=" * 50)
    print("SEARCH RESULTS")
    print("=" * 50)

    # Use .get() throughout so a partial response prints instead of raising
    # KeyError (reference_urls already did; request_id/result did not).
    print(f"\nRequest ID: {response.get('request_id')}")

    # Print number of sources
    urls = response.get('reference_urls', [])
    print(f"\nSources Processed: {len(urls)}")

    # Print the extracted information
    print("\nExtracted Information:")
    print("-" * 30)
    result = response.get('result')  # look the key up once, not three times
    if isinstance(result, dict):
        for key, value in result.items():
            print(f"\n{key.upper()}:")
            if isinstance(value, list):
                for item in value:
                    print(f"  • {item}")
            else:
                print(f"  {value}")
    else:
        print(result)

    # Print source URLs, when the API returned any
    if urls:
        print("\nSources:")
        print("-" * 30)
        for i, url in enumerate(urls, 1):
            print(f"{i}. {url}")
    print("\n" + "=" * 50)
52+
def main():
    """Run the search scraper example end to end."""
    # Pull settings from a local .env file, if one exists.
    load_dotenv()

    # Fail fast when the credential is missing.
    api_key = os.getenv("SCRAPEGRAPH_API_KEY")
    if not api_key:
        raise ValueError("SCRAPEGRAPH_API_KEY not found in environment variables")

    # Client-side logging at INFO level.
    sgai_logger.set_logging(level="INFO")

    client = Client(api_key=api_key)
    try:
        # Basic search scraper example.
        print("\nSearching for information...")
        result = client.searchscraper(user_prompt="Extract webpage information")
        format_response(result)
    except Exception as exc:
        # Surface the failure without crashing the example script.
        print(f"\nError occurred: {str(exc)}")
    finally:
        # Release the client's resources regardless of outcome.
        client.close()


if __name__ == "__main__":
    main()

examples/smart_scraper_graph/README.md

Lines changed: 0 additions & 30 deletions
This file was deleted.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
SCRAPEGRAPH_API_KEY=your-api-key-here
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
# Smart Scraper Examples with Scrapegraph AI
2+
3+
This repository contains examples demonstrating how to use Scrapegraph AI's powerful web scraping capabilities to transform websites into structured data using natural language prompts.
4+
5+
## About Scrapegraph AI
6+
7+
[Scrapegraph AI](https://scrapegraphai.com) is a powerful web scraping API that transforms any website into structured data for AI agents and analytics. It's built specifically for AI agents and LLMs, featuring natural language instructions and structured JSON output.
8+
9+
Key features:
10+
- Universal data extraction from any website
11+
- Intelligent processing with advanced AI
12+
- Lightning-fast setup with official SDKs
13+
- Enterprise-ready with automatic proxy rotation
14+
- Seamless integration with RAG systems
15+
16+
## Examples Included
17+
18+
### 1. Smart Scraper
19+
The `smartscraper_scrapegraphai.py` example demonstrates how to extract structured data from a single website using natural language prompts.
20+
21+
### 2. Search Scraper
22+
The `searchscraper_scrapegraphai.py` example shows how to:
23+
- Search the internet for relevant information
24+
- Extract structured data from multiple sources
25+
- Merge and analyze information from different websites
26+
- Get comprehensive answers to complex queries
27+
28+
## Prerequisites
29+
30+
- Python 3.7+
31+
- pip (Python package manager)
32+
33+
## Installation
34+
35+
1. Clone the repository:
36+
```bash
37+
git clone https://github.com/yourusername/Scrapegraph-ai.git
38+
cd Scrapegraph-ai
39+
```
40+
41+
2. Install required dependencies:
42+
```bash
43+
pip install -r requirements.txt
44+
```
45+
46+
3. Create a `.env` file in the `examples/smart_scraper_graph` directory with:
47+
```env
48+
SCRAPEGRAPH_API_KEY=your-api-key-here
49+
```
50+
51+
## Usage
52+
53+
### Smart Scraper Example
54+
```bash
55+
python smartscraper_scrapegraphai.py
56+
```
57+
58+
### Search Scraper Example
59+
```bash
60+
python searchscraper_scrapegraphai.py
61+
```
62+
63+
## Example Outputs
64+
65+
### Smart Scraper Output
66+
```python
67+
Request ID: abc123...
68+
Result: {
69+
"founders": [
70+
{
71+
"name": "Marco Vinciguerra",
72+
"role": "Founder & Software Engineer",
73+
"bio": "LinkedIn profile of Marco Vinciguerra"
74+
},
75+
{
76+
"name": "Lorenzo Padoan",
77+
"role": "Founder & CEO",
78+
"bio": "LinkedIn profile of Lorenzo Padoan"
79+
}
80+
]
81+
}
82+
Reference URLs: ["https://scrapegraphai.com/about"]
83+
```
84+
85+
### Search Scraper Output
86+
```python
87+
Request ID: xyz789...
88+
Number of sources processed: 3
89+
90+
Extracted Information:
91+
{
92+
"features": [
93+
"Universal data extraction",
94+
"Intelligent processing with AI",
95+
"Lightning-fast setup",
96+
"Enterprise-ready with proxy rotation"
97+
],
98+
"benefits": [
99+
"Perfect for AI agents and LLMs",
100+
"Natural language instructions",
101+
"Structured JSON output",
102+
"Seamless RAG integration"
103+
]
104+
}
105+
106+
Sources:
107+
1. https://scrapegraphai.com
108+
2. https://scrapegraphai.com/features
109+
3. https://scrapegraphai.com/docs
110+
```
111+
112+
## Features Demonstrated
113+
114+
- Environment variable configuration
115+
- API client initialization
116+
- Smart scraping with natural language prompts
117+
- Search-based scraping across multiple sources
118+
- Error handling and response processing
119+
- Secure credential management
120+
121+
## Pricing and Credits
122+
123+
Scrapegraph AI offers various pricing tiers:
124+
- Free: 50 credits included
125+
- Starter: $20/month, 5,000 credits
126+
- Growth: $100/month, 40,000 credits
127+
- Pro: $500/month, 250,000 credits
128+
- Enterprise: Custom solutions
129+
130+
Service costs:
131+
- Smart Scraper: 10 credits per webpage
132+
- Search Scraper: 30 credits per query
133+
134+
## Support and Resources
135+
136+
- [Official Documentation](https://scrapegraphai.com/docs)
137+
- [API Status](https://scrapegraphai.com/status)
138+
139+
140+
## Security Notes
141+
142+
- Never commit your `.env` file to version control
143+
- Keep your API key secure
144+
- Use environment variables for sensitive credentials
145+
146+
## License
147+
148+
This example is provided under the same license as Scrapegraph AI. See the [Terms of Service](https://scrapegraphai.com/terms) for more information.
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
"""
2+
Example implementation using scrapegraph-py client directly.
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraph_py import Client
8+
from scrapegraph_py.logger import sgai_logger
9+
10+
def main():
    """Run the SmartScraper example against the Scrapegraph AI API.

    Raises:
        ValueError: If SCRAPEGRAPH_API_KEY is not set in the environment.
    """
    # Load environment variables from a local .env file, if present.
    load_dotenv()

    # Get API key from environment variables; fail fast when missing.
    api_key = os.getenv("SCRAPEGRAPH_API_KEY")
    if not api_key:
        # Fixed: this message was in Italian, inconsistent with the
        # companion searchscraper example; keep user-facing text in English.
        raise ValueError("SCRAPEGRAPH_API_KEY not found in environment variables")

    # Set up client-side logging.
    sgai_logger.set_logging(level="INFO")

    # Initialize the client with the API key from the environment.
    sgai_client = Client(api_key=api_key)

    try:
        # SmartScraper request: extract structured data from a single page.
        response = sgai_client.smartscraper(
            website_url="https://scrapegraphai.com",
            user_prompt="Extract the founders' informations"
        )

        # Print the response payload.
        print(f"Request ID: {response['request_id']}")
        print(f"Result: {response['result']}")
        if response.get('reference_urls'):
            print(f"Reference URLs: {response['reference_urls']}")

    except Exception as e:
        print(f"Error occurred: {str(e)}")
    finally:
        # Always close the client to release network resources.
        sgai_client.close()


if __name__ == "__main__":
    main()

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ dependencies = [
3131
"jsonschema>=4.23.0",
3232
"duckduckgo-search>=7.2.1",
3333
"pydantic>=2.10.2",
34+
"scrapegraph-py>=0.1.0",
3435
]
3536

3637
readme = "README.md"

uv.lock

Lines changed: 18 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)