Skip to content

Commit 94e9ebd

Browse files
committed
feat: add scrapegraphai integration
1 parent 2a73821 commit 94e9ebd

File tree

9 files changed

+297
-30
lines changed

9 files changed

+297
-30
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
SCRAPEGRAPH_API_KEY=your-api-key-here

examples/search_graph/scrapegraphai/readme.md

Whitespace-only changes.
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
"""
2+
Example implementation of search-based scraping using Scrapegraph AI.
3+
This example demonstrates how to use the searchscraper to extract information from the web.
4+
"""
5+
6+
import os
7+
from typing import Dict, Any
8+
from dotenv import load_dotenv
9+
from scrapegraph_py import Client
10+
from scrapegraph_py.logger import sgai_logger
11+
12+
def format_response(response: Dict[str, Any]) -> None:
    """
    Format and print the search response in a readable way.

    Args:
        response (Dict[str, Any]): The response from the search API.
            Expected keys: ``request_id``, ``result`` and, optionally,
            ``reference_urls``.
    """
    print("\n" + "=" * 50)
    print("SEARCH RESULTS")
    print("=" * 50)

    # Use .get() throughout so a partial response prints instead of raising
    # KeyError (reference_urls already did; request_id/result did not).
    print(f"\nRequest ID: {response.get('request_id')}")

    # Print number of sources
    urls = response.get('reference_urls', [])
    print(f"\nSources Processed: {len(urls)}")

    # Print the extracted information
    print("\nExtracted Information:")
    print("-" * 30)
    result = response.get('result')  # look the key up once, not three times
    if isinstance(result, dict):
        for key, value in result.items():
            print(f"\n{key.upper()}:")
            if isinstance(value, list):
                for item in value:
                    print(f"  • {item}")
            else:
                print(f"  {value}")
    else:
        print(result)

    # Print source URLs, when the API returned any
    if urls:
        print("\nSources:")
        print("-" * 30)
        for i, url in enumerate(urls, 1):
            print(f"{i}. {url}")
    print("\n" + "=" * 50)
52+
def main():
    """Run the search scraper example end to end."""
    # Pull settings from a local .env file, if one exists.
    load_dotenv()

    # Fail fast when the credential is missing.
    api_key = os.getenv("SCRAPEGRAPH_API_KEY")
    if not api_key:
        raise ValueError("SCRAPEGRAPH_API_KEY not found in environment variables")

    # Client-side logging at INFO level.
    sgai_logger.set_logging(level="INFO")

    client = Client(api_key=api_key)
    try:
        # Basic search scraper example.
        print("\nSearching for information...")
        result = client.searchscraper(user_prompt="Extract webpage information")
        format_response(result)
    except Exception as exc:
        # Surface the failure without crashing the example script.
        print(f"\nError occurred: {str(exc)}")
    finally:
        # Release the client's resources regardless of outcome.
        client.close()


if __name__ == "__main__":
    main()

examples/smart_scraper_graph/README.md

Lines changed: 0 additions & 30 deletions
This file was deleted.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
SCRAPEGRAPH_API_KEY=your-api-key-here
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
# Smart Scraper Examples with Scrapegraph AI
2+
3+
This repository contains examples demonstrating how to use Scrapegraph AI's powerful web scraping capabilities to transform websites into structured data using natural language prompts.
4+
5+
## About Scrapegraph AI
6+
7+
[Scrapegraph AI](https://scrapegraphai.com) is a powerful web scraping API that transforms any website into structured data for AI agents and analytics. It's built specifically for AI agents and LLMs, featuring natural language instructions and structured JSON output.
8+
9+
Key features:
10+
- Universal data extraction from any website
11+
- Intelligent processing with advanced AI
12+
- Lightning-fast setup with official SDKs
13+
- Enterprise-ready with automatic proxy rotation
14+
- Seamless integration with RAG systems
15+
16+
## Examples Included
17+
18+
### 1. Smart Scraper
19+
The `smartscraper_scrapegraphai.py` example demonstrates how to extract structured data from a single website using natural language prompts.
20+
21+
### 2. Search Scraper
22+
The `searchscraper_scrapegraphai.py` example shows how to:
23+
- Search the internet for relevant information
24+
- Extract structured data from multiple sources
25+
- Merge and analyze information from different websites
26+
- Get comprehensive answers to complex queries
27+
28+
## Prerequisites
29+
30+
- Python 3.7+
31+
- pip (Python package manager)
32+
33+
## Installation
34+
35+
1. Clone the repository:
36+
```bash
37+
git clone https://github.com/yourusername/Scrapegraph-ai.git
38+
cd Scrapegraph-ai
39+
```
40+
41+
2. Install required dependencies:
42+
```bash
43+
pip install -r requirements.txt
44+
```
45+
46+
3. Create a `.env` file in the `examples/smart_scraper_graph` directory with:
47+
```env
48+
SCRAPEGRAPH_API_KEY=your-api-key-here
49+
```
50+
51+
## Usage
52+
53+
### Smart Scraper Example
54+
```bash
55+
python smartscraper_scrapegraphai.py
56+
```
57+
58+
### Search Scraper Example
59+
```bash
60+
python searchscraper_scrapegraphai.py
61+
```
62+
63+
## Example Outputs
64+
65+
### Smart Scraper Output
66+
```python
67+
Request ID: abc123...
68+
Result: {
69+
"founders": [
70+
{
71+
"name": "Marco Vinciguerra",
72+
"role": "Founder & Software Engineer",
73+
"bio": "LinkedIn profile of Marco Vinciguerra"
74+
},
75+
{
76+
"name": "Lorenzo Padoan",
77+
"role": "Founder & CEO",
78+
"bio": "LinkedIn profile of Lorenzo Padoan"
79+
}
80+
]
81+
}
82+
Reference URLs: ["https://scrapegraphai.com/about"]
83+
```
84+
85+
### Search Scraper Output
86+
```python
87+
Request ID: xyz789...
88+
Number of sources processed: 3
89+
90+
Extracted Information:
91+
{
92+
"features": [
93+
"Universal data extraction",
94+
"Intelligent processing with AI",
95+
"Lightning-fast setup",
96+
"Enterprise-ready with proxy rotation"
97+
],
98+
"benefits": [
99+
"Perfect for AI agents and LLMs",
100+
"Natural language instructions",
101+
"Structured JSON output",
102+
"Seamless RAG integration"
103+
]
104+
}
105+
106+
Sources:
107+
1. https://scrapegraphai.com
108+
2. https://scrapegraphai.com/features
109+
3. https://scrapegraphai.com/docs
110+
```
111+
112+
## Features Demonstrated
113+
114+
- Environment variable configuration
115+
- API client initialization
116+
- Smart scraping with natural language prompts
117+
- Search-based scraping across multiple sources
118+
- Error handling and response processing
119+
- Secure credential management
120+
121+
## Pricing and Credits
122+
123+
Scrapegraph AI offers various pricing tiers:
124+
- Free: 50 credits included
125+
- Starter: $20/month, 5,000 credits
126+
- Growth: $100/month, 40,000 credits
127+
- Pro: $500/month, 250,000 credits
128+
- Enterprise: Custom solutions
129+
130+
Service costs:
131+
- Smart Scraper: 10 credits per webpage
132+
- Search Scraper: 30 credits per query
133+
134+
## Support and Resources
135+
136+
- [Official Documentation](https://scrapegraphai.com/docs)
137+
- [API Status](https://scrapegraphai.com/status)
138+
139+
140+
## Security Notes
141+
142+
- Never commit your `.env` file to version control
143+
- Keep your API key secure
144+
- Use environment variables for sensitive credentials
145+
146+
## License
147+
148+
This example is provided under the same license as Scrapegraph AI. See the [Terms of Service](https://scrapegraphai.com/terms) for more information.
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
"""
2+
Example implementation using scrapegraph-py client directly.
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraph_py import Client
8+
from scrapegraph_py.logger import sgai_logger
9+
10+
def main():
    """Run the SmartScraper example against the Scrapegraph AI API.

    Raises:
        ValueError: If SCRAPEGRAPH_API_KEY is not set in the environment.
    """
    # Load environment variables from a local .env file, if present.
    load_dotenv()

    # Get API key from environment variables; fail fast when missing.
    api_key = os.getenv("SCRAPEGRAPH_API_KEY")
    if not api_key:
        # Fixed: this message was in Italian, inconsistent with the
        # companion searchscraper example; keep user-facing text in English.
        raise ValueError("SCRAPEGRAPH_API_KEY not found in environment variables")

    # Set up client-side logging.
    sgai_logger.set_logging(level="INFO")

    # Initialize the client with the API key from the environment.
    sgai_client = Client(api_key=api_key)

    try:
        # SmartScraper request: extract structured data from a single page.
        response = sgai_client.smartscraper(
            website_url="https://scrapegraphai.com",
            user_prompt="Extract the founders' informations"
        )

        # Print the response payload.
        print(f"Request ID: {response['request_id']}")
        print(f"Result: {response['result']}")
        if response.get('reference_urls'):
            print(f"Reference URLs: {response['reference_urls']}")

    except Exception as e:
        print(f"Error occurred: {str(e)}")
    finally:
        # Always close the client to release network resources.
        sgai_client.close()


if __name__ == "__main__":
    main()

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ dependencies = [
3131
"jsonschema>=4.23.0",
3232
"duckduckgo-search>=7.2.1",
3333
"pydantic>=2.10.2",
34+
"scrapegraph-py>=0.1.0",
3435
]
3536

3637
readme = "README.md"

uv.lock

Lines changed: 18 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)