Skip to content

Added firecrawl as a tool call to crawl website contents from a URL #13

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion apps/studio/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@ VITE_SUPABASE_ANON_KEY=
VITE_MIXPANEL_TOKEN=

# Add your keys here to use Anthropic directly
VITE_ANTHROPIC_API_KEY=
VITE_ANTHROPIC_API_KEY=
# Add your Firecrawl API key here
VITE_FIRECRAWL_API_KEY=
42 changes: 42 additions & 0 deletions apps/studio/electron/main/chat/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import { z } from 'zod';
import { mainWindow } from '..';
import { PersistentStorage } from '../storage';
import { initModel } from './llmProvider';
import { extractUrls } from '@onlook/ai/src/tools/helpers';

class LlmManager {
private static instance: LlmManager;
Expand Down Expand Up @@ -59,6 +60,43 @@ class LlmManager {
return LlmManager.instance;
}

private async processUrls(content: string): Promise<string> {
const urls = extractUrls(content);

if (urls.length === 0) {
return content;
}

try {
const result = await streamText({
model: await initModel(LLMProvider.ANTHROPIC, CLAUDE_MODELS.SONNET, {
requestType: StreamRequestType.SUGGESTIONS,
}),
messages: [
{
role: 'user',
content: content,
},
],
tools: { crawl_urls: chatToolSet.crawl_urls },
maxTokens: 4000,
});

const crawledContent = `
Original request:
${content}

Referenced content from URLs:
${JSON.stringify(result, null, 2)}
`;

return crawledContent;
} catch (error) {
console.error('Error processing URLs:', error);
return content;
}
}

public async stream(
messages: CoreMessage[],
requestType: StreamRequestType,
Expand All @@ -67,6 +105,10 @@ class LlmManager {
skipSystemPrompt?: boolean;
},
): Promise<CompletedStreamResponse> {
const lastUserMessage = messages.findLast((m) => m.role === 'user');
if (lastUserMessage && typeof lastUserMessage.content === 'string') {
lastUserMessage.content = await this.processUrls(lastUserMessage.content);
}
const { abortController, skipSystemPrompt } = options || {};
this.abortController = abortController || new AbortController();
try {
Expand Down
1 change: 1 addition & 0 deletions apps/studio/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
"@emotion/react": "^11.13.3",
"@emotion/styled": "^11.13.0",
"@fontsource-variable/inter": "^5.1.0",
"@mendable/firecrawl-js": "^1.24.0",
"@onlook/foundation": "*",
"@onlook/supabase": "*",
"@onlook/ui": "*",
Expand Down
105 changes: 105 additions & 0 deletions packages/ai/src/tools/crawler.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import FirecrawlApp from '@mendable/firecrawl-js';

// Options forwarded to Firecrawl's crawl API.
// NOTE(review): field semantics mirror the Firecrawl SDK — confirm against
// @mendable/firecrawl-js before extending.
export interface CrawlOptions {
    // Maximum number of pages to crawl (presumably Firecrawl's page limit — verify).
    limit?: number;
    scrapeOptions?: {
        // Output formats requested for each scraped page.
        formats?: (
            | 'markdown'
            | 'html'
            | 'rawHtml'
            | 'content'
            | 'links'
            | 'screenshot'
            | 'screenshot@fullPage'
            | 'extract'
            | 'json'
            | 'changeTracking'
        )[];
    };
}

// Shape of a successful crawl response as consumed by this package.
export interface CrawlerResponse {
    success: boolean;
    error?: string;
    data: Array<{
        html?: string;
        markdown?: string;
    }>;
}

// A single crawled page's content.
export interface CrawledContent {
    markdown?: string;
    html?: string;
}

/**
 * Runtime type guard for {@link CrawlerResponse}.
 *
 * Accepts only a non-null object with a boolean `success`, a non-empty `data`
 * array, and a first item that carries at least one of `html`/`markdown`,
 * each of which — when present — must be a string.
 */
export function validateCrawlerResponse(response: unknown): response is CrawlerResponse {
    if (response === null || typeof response !== 'object') {
        return false;
    }

    const candidate = response as { success?: unknown; data?: unknown };

    if (typeof candidate.success !== 'boolean') {
        return false;
    }

    if (!Array.isArray(candidate.data) || candidate.data.length === 0) {
        return false;
    }

    const first: unknown = candidate.data[0];
    if (first === null || typeof first !== 'object') {
        return false;
    }

    const item = first as { html?: unknown; markdown?: unknown };
    const hasContentField = 'html' in item || 'markdown' in item;
    const htmlOk = item.html === undefined || typeof item.html === 'string';
    const markdownOk = item.markdown === undefined || typeof item.markdown === 'string';

    return hasContentField && htmlOk && markdownOk;
}

/**
 * Lazily-constructed singleton wrapper around the Firecrawl SDK client.
 * Construction throws when VITE_FIRECRAWL_API_KEY is not configured.
 */
export class CrawlerService {
    private static instance: CrawlerService;

    private app: FirecrawlApp;

    private constructor() {
        // NOTE(review): import.meta.env is Vite-specific — confirm this
        // resolves when the package is consumed from the Electron main process.
        const apiKey = import.meta.env.VITE_FIRECRAWL_API_KEY;
        if (!apiKey) {
            throw new Error(
                'VITE_FIRECRAWL_API_KEY is not defined. Please provide a valid API key.',
            );
        }
        this.app = new FirecrawlApp({ apiKey });
    }

    /** Returns the shared instance, creating it on first use. */
    static getInstance(): CrawlerService {
        if (!CrawlerService.instance) {
            CrawlerService.instance = new CrawlerService();
        }
        return CrawlerService.instance;
    }

    /**
     * Crawl `url` via Firecrawl. By default crawls up to 100 pages and
     * requests markdown + html output. Failures are logged and rethrown.
     */
    async crawlUrl(
        url: string,
        options: CrawlOptions = {
            limit: 100,
            scrapeOptions: {
                formats: ['markdown', 'html'],
            },
        },
    ) {
        try {
            const response = await this.app.crawlUrl(url, options);

            if (response.success) {
                return response;
            }
            // Thrown here so the catch below logs it before rethrowing,
            // matching the handling of SDK-raised errors.
            throw new Error(`Failed to crawl: ${response.error}`);
        } catch (error) {
            console.error('Error during crawling:', error);
            throw error;
        }
    }
}
37 changes: 37 additions & 0 deletions packages/ai/src/tools/helpers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,40 @@ export async function getAllFiles(
return { success: false, error: error instanceof Error ? error.message : 'Unknown error' };
}
}

/**
 * Extract all unique, absolute URLs referenced in free-form text.
 *
 * Recognizes plain http(s) URLs, bare `www.` URLs (preceded by start-of-text
 * or whitespace), and markdown links `[text](url)`. Every returned URL is
 * normalized to carry an explicit scheme (`https://` is prepended when
 * missing) and validated with the WHATWG `URL` parser; malformed candidates
 * are dropped.
 *
 * @param text - Arbitrary text that may contain URLs.
 * @returns De-duplicated list of well-formed absolute URLs, in first-seen order.
 */
export function extractUrls(text: string): string[] {
    // Regular URLs with an explicit http/https scheme.
    const httpPattern =
        /https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)/gi;

    // URLs starting with www.
    const wwwPattern =
        /(?:^|\s)www\.[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)/gi;

    // Markdown links [text](url) — capture group 2 is the URL.
    const markdownPattern = /\[([^\]]+)\]\(([^)]+)\)/g;

    const httpUrls = text.match(httpPattern) ?? [];
    // The www match includes the leading whitespace consumed by (?:^|\s),
    // so trim before prefixing the scheme.
    const wwwUrls = (text.match(wwwPattern) ?? []).map((url) => 'https://' + url.trim());
    const markdownUrls = Array.from(text.matchAll(markdownPattern), (match) => match[2]);

    // Normalize every candidate to an absolute URL. (Bug fix: markdown URLs
    // without a scheme were previously validated with an added https:// prefix
    // but returned WITHOUT it, inconsistent with the www handling.)
    const normalized = [...httpUrls, ...wwwUrls, ...markdownUrls].map((url) =>
        url.startsWith('http') ? url : `https://${url}`,
    );

    // Keep only candidates the WHATWG URL parser accepts. (Bug fix: the
    // filter callback previously returned the URL string rather than a
    // boolean — truthy by accident, but not the declared contract.)
    const validUrls = normalized.filter((url) => {
        try {
            new URL(url);
            return true;
        } catch {
            return false;
        }
    });

    // De-duplicate while preserving first-seen order.
    return Array.from(new Set(validUrls));
}
58 changes: 58 additions & 0 deletions packages/ai/src/tools/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { readFile } from 'fs/promises';
import { z } from 'zod';
import { ONLOOK_PROMPT } from '../prompt/onlook';
import { getAllFiles } from './helpers';
import { CrawlerService } from './crawler';

export const listFilesTool = tool({
description: 'List all files in the current directory, including subdirectories',
Expand Down Expand Up @@ -130,8 +131,65 @@ export const getStrReplaceEditorTool = (handlers: FileOperationHandlers) => {
return strReplaceEditorTool;
};

// Tool that crawls each URL in `urls` via CrawlerService and returns per-URL
// results. Per-URL failures are captured as { url, error } entries rather
// than failing the whole batch.
export const crawlUrlTool = tool({
    description: 'Crawl webpage content from provided URL',
    parameters: z.object({
        urls: z.array(z.string()).describe('Array of URLs to crawl'),
        // Optional crawl settings; mirrors CrawlOptions in crawler.ts.
        options: z
            .object({
                limit: z.number().optional(),
                scrapeOptions: z
                    .object({
                        formats: z
                            .array(
                                z.enum([
                                    'markdown',
                                    'html',
                                    'rawHtml',
                                    'content',
                                    'links',
                                    'screenshot',
                                    'screenshot@fullPage',
                                    'extract',
                                    'json',
                                    'changeTracking',
                                ]),
                            )
                            .optional(),
                    })
                    .optional(),
            })
            .optional(),
    }),
    execute: async ({ urls, options }) => {
        try {
            const crawler = CrawlerService.getInstance();
            // Crawl all URLs concurrently; each mapped promise resolves (never
            // rejects) because errors are converted to result entries below.
            const results = await Promise.all(
                urls.map(async (url) => {
                    try {
                        const result = await crawler.crawlUrl(url, options);
                        // NOTE(review): CrawlerService.crawlUrl throws when
                        // !response.success, so this branch looks unreachable;
                        // also `result.status` vs `result.error` — confirm
                        // against the Firecrawl response type.
                        if (!result.success) {
                            return { url, error: result.status };
                        }
                        // Only the first crawled page is returned per URL.
                        return { url, data: result.data[0] };
                    } catch (error) {
                        return {
                            url,
                            error: error instanceof Error ? error.message : 'Unknown error',
                        };
                    }
                }),
            );
            return results;
        } catch (error) {
            // E.g. CrawlerService construction failed (missing API key).
            return `Error: ${error instanceof Error ? error.message : 'Unknown error'}`;
        }
    },
});

// Tool set exposed to the chat LLM.
// NOTE(review): the crawler tool is registered under the key 'crawl_url',
// but apps/studio/electron/main/chat/index.ts references
// `chatToolSet.crawl_urls` (plural), which is `undefined` — keep these key
// names in sync.
export const chatToolSet: ToolSet = {
    list_files: listFilesTool,
    read_files: readFilesTool,
    onlook_instructions: onlookInstructionsTool,
    crawl_url: crawlUrlTool,
};
9 changes: 8 additions & 1 deletion packages/models/src/chat/message/context.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ export enum MessageContextType {
IMAGE = 'image',
ERROR = 'error',
PROJECT = 'project',
LINK = 'link',
}

type BaseMessageContext = {
Expand Down Expand Up @@ -38,9 +39,15 @@ export type ProjectMessageContext = BaseMessageContext & {
path: string;
};

// Context entry for a URL referenced in a chat message.
// NOTE(review): presumably consumed by the URL-crawling tool flow — verify
// against the callers that construct this context.
export type LinkMessageContext = BaseMessageContext & {
    type: MessageContextType.LINK;
    url: string;
};

// Union of every context payload a chat message can carry.
export type ChatMessageContext =
    | FileMessageContext
    | HighlightMessageContext
    | ImageMessageContext
    | ErrorMessageContext
    | ProjectMessageContext
    | LinkMessageContext;