ollama/app/tools/browser_crawl.go
Daniel Hiltgen d3b4b9970a app: add code for macOS and Windows apps under 'app' (#12933)
* app: add code for macOS and Windows apps under 'app'

* app: add readme

* app: windows and linux only for now

* ci: fix ui CI validation

---------

Co-authored-by: jmorganca <jmorganca@gmail.com>
2025-11-04 11:40:17 -08:00

//go:build windows || darwin

package tools

import (
	"context"
	"encoding/json"
	"fmt"
)

// CrawlContent represents the content of a crawled page
type CrawlContent struct {
	Snippet  string `json:"snippet"`
	FullText string `json:"full_text"`
}

// CrawlExtras represents additional data from the crawl API
type CrawlExtras struct {
	Links []CrawlLink `json:"links"`
}

// CrawlLink represents a link found on a crawled page
type CrawlLink struct {
	URL  string `json:"url"`
	Href string `json:"href"`
	Text string `json:"text"`
}

// CrawlResult represents a single crawl result
type CrawlResult struct {
	Title   string       `json:"title"`
	URL     string       `json:"url"`
	Content CrawlContent `json:"content"`
	Extras  CrawlExtras  `json:"extras"`
}

// CrawlResponse represents the complete response from the crawl API
type CrawlResponse struct {
	Results map[string][]CrawlResult `json:"results"`
}

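// For orientation, an illustrative response in the shape these types decode,
// keyed by the requested URL (values are invented for the example, not
// captured from the live API):
//
//	{
//	  "results": {
//	    "https://example.com": [{
//	      "title": "Example Domain",
//	      "url": "https://example.com",
//	      "content": {"snippet": "This domain is for use in examples...", "full_text": "..."},
//	      "extras": {"links": [{"url": "https://www.iana.org/", "href": "https://www.iana.org/", "text": "IANA"}]}
//	    }]
//	  }
//	}
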
// BrowserCrawler tool for crawling web pages using ollama.com crawl API
type BrowserCrawler struct{}

func (g *BrowserCrawler) Name() string {
	return "get_webpage"
}

func (g *BrowserCrawler) Description() string {
	return "Crawl and extract text content from web pages"
}

func (g *BrowserCrawler) Prompt() string {
	return `When you need to read content from web pages, use the get_webpage tool. Simply provide the URLs you want to read and I'll fetch their content for you.

For each URL, I'll extract the main text content in a readable format. If you need to discover links within those pages, set extract_links to true. If the user requires the latest information, set livecrawl to true.

Only use this tool when you need to access current web content. Make sure the URLs are valid and accessible. Do not use this tool for:
- Downloading files or media
- Accessing private/authenticated pages
- Scraping data at high volumes

Always check the returned content to ensure it's relevant before using it in your response.`
}

// Schema returns the JSON schema describing the tool's arguments.
func (g *BrowserCrawler) Schema() map[string]any {
	schemaBytes := []byte(`{
		"type": "object",
		"properties": {
			"urls": {
				"type": "array",
				"items": {
					"type": "string"
				},
				"description": "List of URLs to crawl and extract content from"
			}
		},
		"required": ["urls"]
	}`)

	var schema map[string]any
	if err := json.Unmarshal(schemaBytes, &schema); err != nil {
		return nil
	}
	return schema
}

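// A tool call that satisfies this schema carries a single "urls" array, for
// example (illustrative URLs, not captured from a real model call):
//
//	{"urls": ["https://example.com", "https://ollama.com/blog"]}
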
// Execute validates the "urls" argument and crawls each requested page.
func (g *BrowserCrawler) Execute(ctx context.Context, args map[string]any) (*CrawlResponse, error) {
	urlsRaw, ok := args["urls"].([]any)
	if !ok {
		return nil, fmt.Errorf("urls parameter is required and must be an array of strings")
	}

	// Non-string entries are skipped rather than treated as errors.
	urls := make([]string, 0, len(urlsRaw))
	for _, u := range urlsRaw {
		if urlStr, ok := u.(string); ok {
			urls = append(urls, urlStr)
		}
	}
	if len(urls) == 0 {
		return nil, fmt.Errorf("at least one URL is required")
	}

	return g.performWebCrawl(ctx, urls)
}

// performWebCrawl fetches each requested URL via the shared web fetch helper
// and assembles the results into a CrawlResponse keyed by URL.
func (g *BrowserCrawler) performWebCrawl(ctx context.Context, urls []string) (*CrawlResponse, error) {
	result := &CrawlResponse{Results: make(map[string][]CrawlResult, len(urls))}
	for _, targetURL := range urls {
		fetchResp, err := performWebFetch(ctx, targetURL)
		if err != nil {
			return nil, fmt.Errorf("web_fetch failed for %q: %w", targetURL, err)
		}

		links := make([]CrawlLink, 0, len(fetchResp.Links))
		for _, link := range fetchResp.Links {
			links = append(links, CrawlLink{URL: link, Href: link})
		}

		snippet := truncateString(fetchResp.Content, 400)
		result.Results[targetURL] = []CrawlResult{{
			Title: fetchResp.Title,
			URL:   targetURL,
			Content: CrawlContent{
				Snippet:  snippet,
				FullText: fetchResp.Content,
			},
			Extras: CrawlExtras{Links: links},
		}}
	}
	return result, nil
}
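
For reference, a minimal sketch of driving the tool directly rather than through the app's own tool-dispatch code. The import path, the log/print handling, and the example URL are assumptions for illustration; the argument map mirrors what Execute expects once a model's JSON tool call has been decoded, and the file's windows/darwin build constraint applies to any caller.

package main

import (
	"context"
	"fmt"
	"log"

	// Assumed import path for illustration; adjust to the actual module layout.
	"github.com/ollama/ollama/app/tools"
)

func main() {
	crawler := &tools.BrowserCrawler{}

	// "urls" arrives as []any when a model's JSON tool call is decoded,
	// so the sketch passes it in that form.
	resp, err := crawler.Execute(context.Background(), map[string]any{
		"urls": []any{"https://ollama.com"},
	})
	if err != nil {
		log.Fatal(err)
	}

	for url, results := range resp.Results {
		for _, r := range results {
			fmt.Printf("%s: %q (%d links)\n", url, r.Title, len(r.Extras.Links))
		}
	}
}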