// Example: submit a URL to the scrape API, then fetch the job and print the
// detected schema type. Uses top-level await, so run it as an ES module.

// Submit a URL for scraping
const response = await fetch('https://sentient-api.mediathrive.com/api/scrape', {
  method: 'POST',
  headers: {
    'Content-Type': 'application/json',
  },
  body: JSON.stringify({
    url: 'https://example.com/article',
    fetchOptions: {
      forceStrategy: 'http',
      timeout: 30000,
      additionalWaitMs: 1000,
    },
  }),
});

// fetch() only rejects on network failure; an HTTP error status must be
// checked explicitly, otherwise jobId below would silently be undefined.
if (!response.ok) {
  throw new Error(`Scrape submission failed: HTTP ${response.status}`);
}

// Get job ID from response
const { jobId } = await response.json();

// Retrieve the scrape results
const result = await fetch(`https://sentient-api.mediathrive.com/api/scrape/${jobId}`);
if (!result.ok) {
  throw new Error(`Fetching scrape job ${jobId} failed: HTTP ${result.status}`);
}
const data = await result.json();

// NOTE(review): the job payload carries a `status` field; presumably a job may
// not be "completed" immediately after submission — confirm whether callers
// are expected to poll before reading the schema fields.

// Access the extracted schema data
console.log(`Detected schema: ${data.detected_schema_type}`);
console.log(data.enriched_schema);

Built for engineers who need reliable data extraction at scale
Our AI automatically detects content types and extracts them into schema.org compatible formats with support for 15+ primary schema types.
// GET /api/scrape/:id response
{
  "id": "12345678-1234-1234-1234-123456789012",
  "url": "https://example.com/article",
  "status": "completed",
  "detected_schema_type": "Article",
  "detected_schema_confidence": 0.95,
  "enriched_schema": {
    "@type": "Article",
    "headline": "Example Article",
    "author": "John Doe",
    "datePublished": "2023-01-01T00:00:00Z",
    "articleBody": "This is an example article..."
  }
}

Complete RESTful API with endpoints for scrape job management, rule configuration, and application status monitoring.
// Submit a scrape job
POST /api/scrape
{
  "url": "https://example.com/page",
  "fetchOptions": {
    "forceStrategy": "http",
    "timeout": 30000,
    "additionalWaitMs": 1000,
    "headers": {
      "User-Agent": "Custom User Agent"
    }
  }
}

Smart caching system automatically detects when previously scraped content is requested and returns cached results for improved performance.
// Cache hit response
{
  "message": "Scrape job submitted",
  "jobId": "12345678-1234-1234-1234-123456789012",
  "url": "https://example.com/page",
  "status": "completed",
  "cached": true,
  "schemaType": "Article"
}

Powerful human-in-the-loop (HITL) workflow for reviewing and approving AI-generated extraction rules, ensuring the highest data quality for your most critical sources.
// GET /api/hitl/rules/:id
{
  "ruleset": {
    "id": "12345678-1234-1234-1234-123456789012",
    "domain": "example.com",
    "schema_type": "Article",
    "rules": {
      "title": {
        "selector": "h1.title",
        "type": "text"
      },
      "author": {
        "selector": "span.author",
        "type": "text"
      }
    },
    "ai_confidence_score": 0.92
  },
  "sample": {
    "url": "https://example.com/sample-page",
    "extracted_schema": {
      "@type": "Article",
      "title": "Sample Article",
      "author": "Jane Smith"
    }
  }
}

Flexible options for content retrieval including HTTP-based and headless browser strategies, with customizable timeouts and request headers.
// POST /api/scrape with fetch options
{
  "url": "https://example.com/dynamic-page",
  "fetchOptions": {
    "forceStrategy": "playwright",
    "timeout": 60000,
    "additionalWaitMs": 2000,
    "headers": {
      "User-Agent": "Mozilla/5.0 ...",
      "Accept-Language": "en-US,en;q=0.9"
    }
  }
}

Detailed metrics and status reporting for monitoring extraction performance, rule effectiveness, and application health.
// GET /api/status/metrics (Prometheus format)
# HELP sentient_jobs_total Total number of jobs processed
# TYPE sentient_jobs_total counter
sentient_jobs_total{status="completed",type="scrape"} 100
sentient_jobs_total{status="failed",type="scrape"} 5

# HELP sentient_schema_detection_confidence
# TYPE sentient_schema_detection_confidence gauge
sentient_schema_detection_confidence{schema="Article"} 0.92
sentient_schema_detection_confidence{schema="Product"} 0.89

How our API transforms web content into structured Schema.org data for various applications
Extract structured article data from news sites, blogs, and publications.
// Article schema example (simplified)
{
  "id": "12345678-1234-1234-1234-123456789012",
  "url": "https://example.com/article",
  "status": "completed",
  "schemaType": "NewsArticle",
  "data": {
    "@type": "NewsArticle",
    "headline": "Breaking News: Important Event",
    "author": "John Doe",
    "datePublished": "2025-03-29T10:56:25+00:00",
    "articleBody": "This is the main content of the article..."
  }
}

Extract structured product information from e-commerce sites.
// Product schema example (simplified)
{
  "schemaType": "Product",
  "data": {
    "@type": "Product",
    "name": "Premium Wireless Headphones",
    "description": "High-quality wireless headphones with noise cancellation",
    "image": "https://example.com/images/headphones.jpg",
    "brand": {
      "@type": "Brand",
      "name": "AudioTech"
    },
    "offers": {
      "@type": "Offer",
      "price": 129.99,
      "priceCurrency": "USD",
      "availability": "https://schema.org/InStock"
    }
  }
}

Extract detailed recipe information from food blogs and recipe sites.
// Recipe schema example (simplified)
{
  "schemaType": "Recipe",
  "data": {
    "@type": "Recipe",
    "name": "Chocolate Chip Cookies",
    "recipeCategory": ["Dessert", "Baking"],
    "recipeIngredient": [
      "2 cups all-purpose flour",
      "1 cup butter",
      "1 cup chocolate chips"
    ],
    "recipeInstructions": [
      "Preheat oven to 350°F",
      "Mix ingredients",
      "Bake for 12 minutes"
    ],
    "cookTime": "PT12M"
  }
}

Extract event information from venue sites and ticketing platforms.
// Event schema example (simplified)
{
  "schemaType": "Event",
  "data": {
    "@type": "Event",
    "name": "Annual Tech Conference",
    "startDate": "2025-06-15T09:00:00-07:00",
    "endDate": "2025-06-17T17:00:00-07:00",
    "location": {
      "@type": "Place",
      "name": "Convention Center",
      "address": {
        "@type": "PostalAddress",
        "addressLocality": "San Francisco"
      }
    },
    "performer": {
      "@type": "Person",
      "name": "Jane Smith"
    }
  }
}

Extract structured data about organizations and people.
// Organization schema example (simplified)
{
  "schemaType": "Organization",
  "data": {
    "@type": "Organization",
    "name": "Acme Corporation",
    "description": "Leading provider of innovative solutions",
    "url": "https://example.com",
    "logo": "https://example.com/logo.png",
    "address": {
      "@type": "PostalAddress",
      "streetAddress": "123 Main St",
      "addressLocality": "San Francisco"
    },
    "telephone": "+1-555-123-4567",
    "sameAs": [
      "https://twitter.com/acmecorp",
      "https://linkedin.com/company/acmecorp"
    ]
  }
}

Extract comprehensive metadata from web pages for organization and indexing.
// WebPage schema example (simplified)
{
  "id": "7874a552-8c19-4bbb-8d3b-132889c1a8f0",
  "url": "https://example.com/page",
  "status": "completed",
  "schemaType": "WebPage",
  "data": {
    "@type": "WebPage",
    "name": "Page Title - Example Site",
    "headline": "Main Headline of the Page",
    "description": "This is the page description...",
    "datePublished": "2025-03-29T10:56:25+00:00",
    "dateModified": "2025-03-29T10:56:25+00:00",
    "breadcrumb": {
      "@id": "https://example.com/page#breadcrumb"
    },
    "inLanguage": "en-US"
  }
}

Join a community of media professionals building the next generation of tools. Get free access to premium features and shape the future of the industry.
Connect with industry professionals
Track mentions and coverage in real-time
Manage your media presence efficiently
Work smarter with intelligent tools
Be part of the next media revolution
Can't find what you're looking for? Reach out to our support team — we typically reply within 2 business hours.