OCR & Document Processing API
API endpoints for extracting text and data from documents
OCR & Document Processing API
Extract text and structured data from documents using OCR and advanced document processing.
Extract Document
Extract text and data from a document immediately.
POST /documents/extract
Request Body
{
"documentUrl": "https://your-domain.com/document.pdf"
}
Response
{
"success": true,
"data": {
"documentId": "doc_abc123",
"extractedText": "INVOICE\nInvoice Number: INV-2024-001...",
"confidence": 0.94,
"processingTime": "1.2s",
"pages": 3,
"language": "en",
"format": "pdf",
"metadata": {
"fileSize": 245760,
"dimensions": {
"width": 612,
"height": 792
}
}
}
}
Parse Document with Template
Extract structured data from a document using template matching.
POST /documents/{id}/parse
Request Body
{
"documentUrl": "https://your-domain.com/invoice.pdf",
"templateId": "tpl_abc123",
"includeConfidence": true,
"outputFormat": "structured"
}
Response
{
"success": true,
"data": {
"documentId": "doc_abc123",
"templateId": "tpl_abc123",
"templateName": "Standard Invoice Template",
"extractedData": {
"invoice_number": "INV-2024-001",
"vendor_name": "Acme Corp",
"invoice_date": "2024-01-15",
"total_amount": 1250.00,
"line_items": [
{
"description": "Ocean Freight",
"quantity": 1,
"unit_price": 1000.00,
"total": 1000.00
}
]
},
"confidence": {
"overall": 0.94,
"fields": {
"invoice_number": 0.99,
"total_amount": 0.91,
"vendor_name": 0.96
}
},
"processingTime": "2.1s",
"status": "completed"
}
}
Submit Advanced Processing Job
Submit a document for advanced processing with enhanced OCR capabilities.
POST /documents/advanced-processing
Request Body
{
"documentId": "doc_abc123",
"documentUrl": "https://your-domain.com/complex-document.pdf",
"features": ["TABLES", "FORMS", "QUERIES"],
"queries": [
{
"text": "What is the total amount?",
"alias": "TOTAL_AMOUNT"
}
]
}
Response
{
"success": true,
"data": {
"jobId": "job_xyz789",
"status": "IN_PROGRESS",
"documentId": "doc_abc123",
"features": ["TABLES", "FORMS", "QUERIES"],
"submittedAt": "2024-01-15T10:30:00Z",
"estimatedCompletion": "2024-01-15T10:32:00Z"
}
}
Get Processing Job Status
Check the status of an advanced processing job.
GET /documents/jobs/{jobId}
Response
{
"success": true,
"data": {
"jobId": "job_xyz789",
"status": "SUCCEEDED",
"documentId": "doc_abc123",
"submittedAt": "2024-01-15T10:30:00Z",
"completedAt": "2024-01-15T10:31:45Z",
"processingTime": "1m 45s",
"pagesProcessed": 5,
"features": ["TABLES", "FORMS", "QUERIES"],
"warnings": [],
"nextToken": null
}
}
Get Processing Job Results
Retrieve the results from a completed processing job.
GET /documents/jobs/{jobId}/results
Query Parameters
| Parameter | Type | Description |
|---|---|---|
| format | string | Output format (json, csv) |
| includeGeometry | boolean | Include coordinate information |
| maxResults | integer | Maximum number of results |
| nextToken | string | Pagination token |
Response
{
"success": true,
"data": {
"jobId": "job_xyz789",
"status": "SUCCEEDED",
"documentMetadata": {
"pages": 5,
"format": "PDF",
"fileSize": 1048576
},
"blocks": [
{
"blockType": "LINE",
"id": "block_001",
"text": "INVOICE",
"confidence": 99.12,
"geometry": {
"boundingBox": {
"width": 0.15,
"height": 0.04,
"left": 0.42,
"top": 0.08
}
},
"page": 1
}
],
"tables": [
{
"id": "table_001",
"rows": 5,
"columns": 4,
"cells": [
{
"rowIndex": 0,
"columnIndex": 0,
"text": "Description",
"confidence": 98.5
}
],
"geometry": {
"boundingBox": {
"width": 0.8,
"height": 0.3,
"left": 0.1,
"top": 0.4
}
}
}
],
"forms": [
{
"id": "form_001",
"keyValuePairs": [
{
"key": {
"text": "Invoice Number:",
"confidence": 97.8
},
"value": {
"text": "INV-2024-001",
"confidence": 99.1
}
}
]
}
],
"queries": [
{
"text": "What is the total amount?",
"alias": "TOTAL_AMOUNT",
"answer": {
"text": "$1,250.00",
"confidence": 96.3
}
}
],
"nextToken": null
}
}
Get Document Layout
Retrieve document layout and structure information.
GET /documents/layout
Query Parameters
| Parameter | Type | Required | Description |
|---|---|---|---|
| documentUrl | string | Yes | URL of the document to analyze |
| includeText | boolean | No | Include extracted text |
| includeImages | boolean | No | Include image regions |
Response
{
"success": true,
"data": {
"documentId": "doc_abc123",
"pages": [
{
"pageNumber": 1,
"width": 612,
"height": 792,
"orientation": "portrait",
"textBlocks": [
{
"id": "text_001",
"text": "INVOICE",
"confidence": 0.99,
"boundingBox": {
"left": 250,
"top": 50,
"width": 112,
"height": 24
},
"fontSize": 18,
"fontStyle": "bold"
}
],
"tables": [
{
"id": "table_001",
"rows": 5,
"columns": 4,
"boundingBox": {
"left": 50,
"top": 200,
"width": 500,
"height": 120
},
"confidence": 0.96
}
],
"images": [
{
"id": "image_001",
"type": "logo",
"boundingBox": {
"left": 50,
"top": 50,
"width": 100,
"height": 50
}
}
]
}
],
"metadata": {
"totalPages": 1,
"language": "en",
"documentType": "invoice",
"confidence": 0.94
}
}
}
Batch Process Documents
Process multiple documents in a single request.
POST /documents/batch-process
Request Body
{
"documents": [
{
"id": "doc_001",
"url": "https://your-domain.com/doc1.pdf"
},
{
"id": "doc_002",
"url": "https://your-domain.com/doc2.pdf"
}
],
"processingType": "advanced",
"features": ["TABLES", "FORMS"],
"callbackUrl": "https://your-app.com/webhook"
}
Response
{
"success": true,
"data": {
"batchId": "batch_xyz789",
"status": "PROCESSING",
"documentsCount": 2,
"submittedAt": "2024-01-15T10:30:00Z",
"estimatedCompletion": "2024-01-15T10:35:00Z",
"jobs": [
{
"documentId": "doc_001",
"jobId": "job_001",
"status": "IN_PROGRESS"
},
{
"documentId": "doc_002",
"jobId": "job_002",
"status": "QUEUED"
}
]
}
}
Get Batch Status
Check the status of a batch processing job.
GET /documents/batch/{batchId}
Response
{
"success": true,
"data": {
"batchId": "batch_xyz789",
"status": "COMPLETED",
"documentsCount": 2,
"completedCount": 2,
"failedCount": 0,
"submittedAt": "2024-01-15T10:30:00Z",
"completedAt": "2024-01-15T10:34:22Z",
"jobs": [
{
"documentId": "doc_001",
"jobId": "job_001",
"status": "SUCCEEDED",
"processingTime": "2.1s"
},
{
"documentId": "doc_002",
"jobId": "job_002",
"status": "SUCCEEDED",
"processingTime": "1.8s"
}
]
}
}
Document Analysis
Get detailed analysis of document structure and content.
POST /documents/{id}/analyze
Request Body
{
"analysisTypes": ["layout", "language", "quality", "classification"],
"includeMetrics": true
}
Response
{
"success": true,
"data": {
"documentId": "doc_abc123",
"analysis": {
"layout": {
"type": "structured",
"hasHeader": true,
"hasFooter": true,
"columnCount": 1,
"tableCount": 2
},
"language": {
"primary": "en",
"confidence": 0.98,
"detected": ["en"]
},
"quality": {
"score": 0.92,
"resolution": "300dpi",
"clarity": "good",
"skew": 0.2,
"issues": []
},
"classification": {
"documentType": "invoice",
"confidence": 0.89,
"category": "financial",
"subtype": "commercial_invoice"
}
},
"metrics": {
"textCoverage": 0.75,
"processingTime": "1.5s",
"complexity": "medium"
}
}
}
Processing Options
OCR Features
- `TEXT` - Basic text extraction
- `TABLES` - Table structure recognition
- `FORMS` - Form field extraction
- `QUERIES` - Query-based extraction
- `SIGNATURES` - Signature detection
Output Formats
- `json` - Structured JSON response
- `csv` - Comma-separated values
- `xml` - XML format
- `text` - Plain text extraction
Quality Settings
- `speed` - Fast processing, lower accuracy
- `balanced` - Balance of speed and accuracy
- `accuracy` - Highest accuracy, slower processing
Error Codes
| Code | Description |
|---|---|
| DOCUMENT_NOT_FOUND | Document ID doesn't exist |
| INVALID_DOCUMENT_URL | Document URL is not accessible |
| UNSUPPORTED_FORMAT | Document format not supported |
| PROCESSING_FAILED | Document processing failed |
| JOB_NOT_FOUND | Processing job ID doesn't exist |
| JOB_IN_PROGRESS | Job is still processing |
| DOCUMENT_TOO_LARGE | Document exceeds size limits |
| INVALID_TEMPLATE | Template cannot be applied to document |
| EXTRACTION_TIMEOUT | Processing exceeded time limit |