Skip to content

Commit 7340375

Browse files
committed
feat: add markdownify endpoint
1 parent 94e9ebd commit 7340375

File tree

7 files changed

+264
-1
lines changed

7 files changed

+264
-1
lines changed

examples/markdownify/.env.example

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
SCRAPEGRAPH_API_KEY=your SCRAPEGRAPH_API_KEY
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
"""
2+
Example script demonstrating the markdownify functionality
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraph_py import Client
8+
from scrapegraph_py.logger import sgai_logger
9+
10+
def main():
    """Run the markdownify demo: read the API key, convert one page, print the outcome.

    Raises:
        ValueError: If SCRAPEGRAPH_API_KEY is unset or empty in the environment.
    """
    # Populate the process environment (e.g. from a local .env file).
    load_dotenv()

    # Configure the SDK logger at INFO level.
    sgai_logger.set_logging(level="INFO")

    # Fail fast: the client is only constructed once a key is available.
    key = os.getenv("SCRAPEGRAPH_API_KEY")
    if not key:
        raise ValueError("SCRAPEGRAPH_API_KEY environment variable not found")
    client = Client(api_key=key)

    # Example 1: Convert a website to Markdown
    thin_rule = "-" * 50
    thick_rule = "=" * 50
    print("Example 1: Converting website to Markdown")
    print(thin_rule)
    reply = client.markdownify(website_url="https://example.com")
    print("Markdown output:")
    # The reply is indexed like a dictionary; "result" holds the converted text.
    print(reply["result"])
    print("\nMetadata:")
    # "metadata" may be missing, so fall back to an empty dict.
    print(reply.get("metadata", {}))
    print("\n" + thick_rule + "\n")


if __name__ == "__main__":
    main()

examples/markdownify/readme.md

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# Markdownify Graph Example
2+
3+
This example demonstrates how to use the Markdownify graph to convert HTML content to Markdown format.
4+
5+
## Features
6+
7+
- Convert HTML content to clean, readable Markdown
8+
- Support for both URL and direct HTML input
9+
- Maintains formatting and structure of the original content
10+
- Handles complex HTML elements and nested structures
11+
12+
## Usage
13+
14+
```python
15+
from scrapegraph_py import Client
16+
from scrapegraph_py.logger import sgai_logger
17+
18+
# Set up logging
19+
sgai_logger.set_logging(level="INFO")
20+
21+
# Initialize the client
22+
sgai_client = Client(api_key="your-api-key")
23+
24+
# Example 1: Convert a website to Markdown
25+
response = sgai_client.markdownify(
26+
website_url="https://example.com"
27+
)
28+
print(response["result"])
29+
30+
# Example 2: Convert HTML content directly
31+
html_content = """
32+
<div>
33+
<h1>Hello World</h1>
34+
<p>This is a <strong>test</strong> paragraph.</p>
35+
</div>
36+
"""
37+
response = sgai_client.markdownify(
38+
html_content=html_content
39+
)
40+
print(response["result"])
41+
```
42+
43+
## Parameters
44+
45+
The `markdownify` method accepts the following parameters:
46+
47+
- `website_url` (str, optional): The URL of the website to convert to Markdown
48+
- `html_content` (str, optional): Direct HTML content to convert to Markdown
49+
50+
Note: You must provide either `website_url` or `html_content`, but not both.
51+
52+
## Response
53+
54+
The response is a dictionary containing:
55+
56+
- `result` (str): The converted Markdown content
57+
- `metadata` (dict): Additional information about the conversion process
58+
59+
## Error Handling
60+
61+
The graph handles various edge cases:
62+
63+
- Invalid URLs
64+
- Malformed HTML
65+
- Network errors
66+
- Timeout issues
67+
68+
If an error occurs, it will be logged and raised with appropriate error messages.
69+
70+
## Best Practices
71+
72+
1. Always provide a valid URL or well-formed HTML content
73+
2. Use appropriate logging levels for debugging
74+
3. Handle the response appropriately in your application
75+
4. Consider rate limiting for large-scale conversions
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
"""
2+
markdownify_graph module
3+
"""
4+
5+
from typing import Dict, List, Optional, Tuple
6+
7+
from ..nodes import (
8+
FetchNode,
9+
MarkdownifyNode,
10+
)
11+
from .base_graph import BaseGraph
12+
13+
14+
class MarkdownifyGraph(BaseGraph):
    """
    Two-node graph that turns a web page or raw HTML into Markdown.

    The pipeline is a single edge. A ``FetchNode`` is configured to read the
    ``url`` or ``html`` key from the state and write ``html_content``; a
    ``MarkdownifyNode`` is configured to read ``html_content`` and write
    ``markdown``.

    Args:
        llm_model: The language model handle. Accepted, but not referenced by
            this constructor.
        embedder_model: The embedding model handle (optional). Accepted, but
            not referenced by this constructor.
        node_config: Optional configuration dictionary, handed unchanged to
            both nodes.

    Example:
        >>> graph = MarkdownifyGraph(
        ...     llm_model=your_llm_model,
        ...     embedder_model=your_embedder_model
        ... )
        >>> result, _ = graph.execute({"url": "https://example.com"})
        >>> print(result["markdown"])
    """

    def __init__(
        self,
        llm_model,
        embedder_model=None,
        node_config: Optional[Dict] = None,
    ):
        # Stage 1: obtain the HTML, from a URL or from HTML supplied in the state.
        fetcher = FetchNode(
            input="url | html",
            output=["html_content"],
            node_config=node_config,
        )

        # Stage 2: convert whatever stage 1 stored under "html_content".
        converter = MarkdownifyNode(
            input="html_content",
            output=["markdown"],
            node_config=node_config,
        )

        # Wire the two stages together; execution starts at the fetcher.
        super().__init__(
            nodes=[fetcher, converter],
            edges=[(fetcher, converter)],
            entry_point=fetcher,
            graph_name="Markdownify",
        )

    def execute(
        self, initial_state: Dict
    ) -> Tuple[Dict, List[Dict]]:
        """
        Run the graph on the given starting state.

        This simply delegates to ``BaseGraph.execute``; it exists to document
        the state keys this particular graph works with.

        Args:
            initial_state: A dictionary holding one of:
                - "url": address of the page to fetch and convert
                - "html": HTML markup to convert

        Returns:
            A tuple of:
                - the final state dictionary, with the Markdown under "markdown"
                - the list of execution log entries
        """
        return super().execute(initial_state)

scrapegraphai/nodes/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from .graph_iterator_node import GraphIteratorNode
2121
from .html_analyzer_node import HtmlAnalyzerNode
2222
from .image_to_text_node import ImageToTextNode
23+
from .markdownify_node import MarkdownifyNode
2324
from .merge_answers_node import MergeAnswersNode
2425
from .merge_generated_scripts_node import MergeGeneratedScriptsNode
2526
from .parse_node import ParseNode
@@ -45,6 +46,7 @@
4546
"ParseNode",
4647
"ParseNodeDepthK",
4748
"RobotsNode",
49+
"MarkdownifyNode",
4850
# Analysis nodes
4951
"HtmlAnalyzerNode",
5052
"GetProbableTagsNode",
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
"""
2+
MarkdownifyNode Module
3+
"""
4+
5+
from typing import List, Optional
6+
7+
from ..utils.convert_to_md import convert_to_md
8+
from .base_node import BaseNode
9+
10+
11+
class MarkdownifyNode(BaseNode):
    """
    Graph node that rewrites HTML held in the state as Markdown.

    The HTML is looked up in the state under the first resolved input key,
    passed through the ``convert_to_md`` utility, and the result is stored
    under the first output key.

    Attributes:
        verbose (bool): Value of the "verbose" entry of ``node_config``;
            False when no config (or no such entry) is supplied.

    Args:
        input (str): Boolean expression naming the state keys this node reads.
        output (List[str]): State keys this node writes; only the first is used.
        node_config (Optional[dict]): Extra settings for the node.
        node_name (str): Unique name of the node, "Markdownify" by default.
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "Markdownify",
    ):
        super().__init__(node_name, "node", input, output, 1, node_config)

        # Verbosity is opt-in and stays off when no config is given.
        if node_config is None:
            self.verbose = False
        else:
            self.verbose = node_config.get("verbose", False)

    def execute(self, state: dict) -> dict:
        """
        Convert the HTML found in the state to Markdown and store it back.

        Args:
            state (dict): Current graph state. The first key resolved from the
                node's input expression is used to locate the HTML.

        Returns:
            dict: The same state object, with the Markdown written under the
            node's first output key.

        Raises:
            KeyError: If the resolved input key is not present in the state.
        """
        self.logger.info(f"--- Executing {self.node_name} Node ---")

        # Only the first resolved input key is consulted.
        source_key = self.get_input_keys(state)[0]
        html = state[source_key]

        markdown = convert_to_md(html)

        # Write the result under the first declared output key.
        state.update({self.output[0]: markdown})
        return state

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)