feat: add markdownify endpoint

VinciGit00 · VinciGit00 · commit 73403755da1e · 2025-06-13T12:41:21.000+02:00
diff --git a/examples/markdownify/.env.example b/examples/markdownify/.env.example
@@ -0,0 +1 @@
+SCRAPEGRAPH_API_KEY=your SCRAPEGRAPH_API_KEY
diff --git a/examples/markdownify/markdownify_scrapegraphai.py b/examples/markdownify/markdownify_scrapegraphai.py
@@ -0,0 +1,35 @@
+"""
+Example script demonstrating the markdownify functionality
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraph_py import Client
+from scrapegraph_py.logger import sgai_logger
+
+def main():
+    # Entry point of the example: read variables from a .env file first
+    load_dotenv()
+    
+    # Set the SDK logger to INFO level
+    sgai_logger.set_logging(level="INFO")
+
+    # Build the client; a missing or empty API key aborts with ValueError
+    api_key = os.getenv("SCRAPEGRAPH_API_KEY")
+    if not api_key:
+        raise ValueError("SCRAPEGRAPH_API_KEY environment variable not found")
+    sgai_client = Client(api_key=api_key)
+
+    # Example 1: Convert a website to Markdown
+    print("Example 1: Converting website to Markdown")
+    print("-" * 50)
+    response = sgai_client.markdownify(
+        website_url="https://example.com"
+    )
+    print("Markdown output:")
+    print(response["result"])  # Subscript access: "result" is required (presumably response is a dict)
+    print("\nMetadata:")
+    print(response.get("metadata", {}))  # "metadata" is treated as optional; falls back to {}
+    print("\n" + "=" * 50 + "\n")
+if __name__ == "__main__":
+    main()
diff --git a/examples/markdownify/readme.md b/examples/markdownify/readme.md
@@ -0,0 +1,75 @@
+# Markdownify Graph Example
+
+This example demonstrates how to use the Markdownify graph to convert HTML content to Markdown format.
+
+## Features
+
+- Convert HTML content to clean, readable Markdown
+- Support for both URL and direct HTML input
+- Maintains formatting and structure of the original content
+- Handles complex HTML elements and nested structures
+
+## Usage
+
+```python
+from scrapegraph_py import Client
+from scrapegraph_py.logger import sgai_logger
+
+# Set up logging
+sgai_logger.set_logging(level="INFO")
+
+# Initialize the client
+sgai_client = Client(api_key="your-api-key")
+
+# Example 1: Convert a website to Markdown
+response = sgai_client.markdownify(
+    website_url="https://example.com"
+)
+print(response["result"])
+
+# Example 2: Convert HTML content directly
+html_content = """
+<div>
+    <h1>Hello World</h1>
+    <p>This is a <strong>test</strong> paragraph.</p>
+</div>
+"""
+response = sgai_client.markdownify(
+    html_content=html_content
+)
+print(response["result"])
+```
+
+## Parameters
+
+The `markdownify` method accepts the following parameters:
+
+- `website_url` (str, optional): The URL of the website to convert to Markdown
+- `html_content` (str, optional): Direct HTML content to convert to Markdown
+
+Note: You must provide either `website_url` or `html_content`, but not both.
+
+## Response
+
+The response is a dictionary containing:
+
+- `result` (str): The converted Markdown content
+- `metadata` (dict, optional): Additional information about the conversion process
+
+## Error Handling
+
+The graph handles various edge cases:
+
+- Invalid URLs
+- Malformed HTML
+- Network errors
+- Timeout issues
+
+If an error occurs, it will be logged and raised with appropriate error messages.
+
+## Best Practices
+
+1. Always provide a valid URL or well-formed HTML content
+2. Use appropriate logging levels for debugging
+3. Handle the response appropriately in your application
+4. Consider rate limiting for large-scale conversions
diff --git a/scrapegraphai/graphs/markdownify_graph.py b/scrapegraphai/graphs/markdownify_graph.py
@@ -0,0 +1,83 @@
+"""
+markdownify_graph module
+"""
+
+from typing import Dict, List, Optional, Tuple
+
+from ..nodes import (
+    FetchNode,
+    MarkdownifyNode,
+)
+from .base_graph import BaseGraph
+
+
+class MarkdownifyGraph(BaseGraph):
+    """
+    A graph that converts HTML content to Markdown format.
+
+    This graph takes a URL or HTML content as input and converts it to clean, readable Markdown.
+    It is a fixed two-node pipeline entered through the FetchNode:
+    1. FetchNode is configured with input "url | html" and output "html_content"
+    2. MarkdownifyNode is configured with input "html_content" and output "markdown"
+
+    Args:
+        llm_model: Accepted by the constructor but not referenced in its body
+        embedder_model: Accepted but likewise not referenced (optional)
+        node_config: Configuration dict passed unchanged to both nodes (optional)
+
+    Example:
+        >>> graph = MarkdownifyGraph(
+        ...     llm_model=your_llm_model,
+        ...     embedder_model=your_embedder_model
+        ... )
+        >>> result, _ = graph.execute({"url": "https://example.com"})
+        >>> print(result["markdown"])
+    """
+
+    def __init__(
+        self,
+        llm_model,
+        embedder_model=None,
+        node_config: Optional[Dict] = None,
+    ):
+        # Both nodes are built from the same node_config
+        fetch_node = FetchNode(
+            input="url | html",
+            output=["html_content"],
+            node_config=node_config,
+        )
+
+        markdownify_node = MarkdownifyNode(
+            input="html_content",
+            output=["markdown"],
+            node_config=node_config,
+        )
+
+        # Single edge from fetch_node to markdownify_node; fetch_node is the entry point
+        nodes = [fetch_node, markdownify_node]
+        edges = [(fetch_node, markdownify_node)]
+
+        super().__init__(
+            nodes=nodes,
+            edges=edges,
+            entry_point=fetch_node,
+            graph_name="Markdownify",
+        )
+
+    def execute(
+        self, initial_state: Dict
+    ) -> Tuple[Dict, List[Dict]]:
+        """
+        Execute the markdownify graph by delegating to BaseGraph.execute.
+
+        Args:
+            initial_state: A dictionary containing either:
+                - "url": The URL to fetch and convert to markdown
+                - "html": The HTML content to convert to markdown
+
+        Returns:
+            Tuple containing:
+                - Dictionary with the markdown result in the "markdown" key
+                - List of execution logs
+        """
+        return super().execute(initial_state) 
diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
@@ -20,6 +20,7 @@
 from .graph_iterator_node import GraphIteratorNode
 from .html_analyzer_node import HtmlAnalyzerNode
 from .image_to_text_node import ImageToTextNode
+from .markdownify_node import MarkdownifyNode
 from .merge_answers_node import MergeAnswersNode
 from .merge_generated_scripts_node import MergeGeneratedScriptsNode
 from .parse_node import ParseNode
@@ -45,6 +46,7 @@
     "ParseNode",
     "ParseNodeDepthK",
     "RobotsNode",
+    "MarkdownifyNode",
     # Analysis nodes
     "HtmlAnalyzerNode",
     "GetProbableTagsNode",
diff --git a/scrapegraphai/nodes/markdownify_node.py b/scrapegraphai/nodes/markdownify_node.py
@@ -0,0 +1,67 @@
+"""
+MarkdownifyNode Module
+"""
+
+from typing import List, Optional
+
+from ..utils.convert_to_md import convert_to_md
+from .base_node import BaseNode
+
+
+class MarkdownifyNode(BaseNode):
+    """
+    A node responsible for converting HTML content to Markdown format.
+
+    This node takes HTML content from the state and converts it to clean, readable Markdown.
+    It uses the convert_to_md utility function to perform the conversion.
+
+    Attributes:
+        verbose (bool): Value of node_config["verbose"]; False when unset or when no config is given.
+
+    Args:
+        input (str): Boolean expression defining the input keys needed from the state.
+        output (List[str]): List of output keys to be updated in the state.
+        node_config (Optional[dict]): Additional configuration for the node.
+        node_name (str): The unique identifier name for the node, defaulting to "Markdownify".
+    """
+
+    def __init__(
+        self,
+        input: str,
+        output: List[str],
+        node_config: Optional[dict] = None,
+        node_name: str = "Markdownify",
+    ):
+        super().__init__(node_name, "node", input, output, 1, node_config)
+
+        self.verbose = (
+            False if node_config is None else node_config.get("verbose", False)
+        )
+
+    def execute(self, state: dict) -> dict:
+        """
+        Executes the node's logic to convert HTML content to Markdown.
+
+        Args:
+            state (dict): The current state of the graph. The first key returned by
+                         get_input_keys is used to read the HTML content from it.
+
+        Returns:
+            dict: The same state object, with the Markdown stored under the first output key.
+
+        Raises:
+            KeyError: If the first key resolved by get_input_keys is missing from
+                     the state.
+        """
+        self.logger.info(f"--- Executing {self.node_name} Node ---")
+
+        input_keys = self.get_input_keys(state)
+        html_content = state[input_keys[0]]
+
+        # NOTE(review): assumes this value is an HTML string that convert_to_md accepts; confirm upstream output
+        markdown_content = convert_to_md(html_content)
+
+        # The state dict is mutated in place and then returned
+        state.update({self.output[0]: markdown_content})
+
+        return state 
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+SCRAPEGRAPH_API_KEY=your SCRAPEGRAPH_API_KEY`