Skip to content

Source Grounding Guide

Source grounding lets you track exactly where each piece of extracted information comes from in your source documents. This is crucial for building trust, debugging extraction issues, and meeting compliance requirements.

Source grounding records the exact text spans in your input documents that correspond to each extracted field.

Transparency

See exactly what text led to each extracted value

Trust

Verify extractions against source material

Debugging

Identify why certain values were extracted incorrectly

Compliance

Meet audit requirements for data lineage

Enable source grounding when creating your extractor:

from langstruct import LangStruct, Schema, Field
class PersonSchema(Schema):
    """Schema declaring the fields to extract for a person."""

    name: str = Field(description="Full name of the person")
    age: int = Field(description="Age in years")
    location: str = Field(description="Current location")
# Enable source grounding
extractor = LangStruct(
    schema=PersonSchema,
    model="gemini-2.5-flash",  # Fast and cost-effective
    use_sources=True  # Enable source tracking
)

text = "Dr. Sarah Johnson, 34, works at MIT in Cambridge, Massachusetts."
result = extractor.extract(text)

# Access source information
print("=== Extracted Data ===")
print(f"Name: {result.entities['name']}")
print(f"Age: {result.entities['age']}")
print(f"Location: {result.entities['location']}")

# result.sources maps each field name to a list of span objects that
# record where in `text` the value was found.
print("\n=== Source Locations ===")
for field, spans in result.sources.items():
    for span in spans:
        print(f"{field}: '{span.text}' (positions {span.start}-{span.end})")

Output:

=== Extracted Data ===
Name: Dr. Sarah Johnson
Age: 34
Location: Cambridge, Massachusetts
=== Source Locations ===
name: 'Dr. Sarah Johnson' (positions 0-16)
age: '34' (positions 19-20)
location: 'Cambridge, Massachusetts' (positions 39-62)

Source spans provide detailed information about text locations:

# Extract with source grounding
result = extractor.extract(text)

# Examine source spans in detail
for field_name, spans in result.sources.items():
    print(f"\nField: {field_name}")
    for i, span in enumerate(spans):
        print(f" Span {i+1}:")
        print(f" Text: '{span.text}'")
        print(f" Start: {span.start}")
        print(f" End: {span.end}")
        print(f" Length: {len(span.text)}")
        # Get surrounding context: 20 characters on either side of the
        # span, clamped to the bounds of the input text.
        context_start = max(0, span.start - 20)
        context_end = min(len(text), span.end + 20)
        context = text[context_start:context_end]
        print(f" Context: '...{context}...'")

Some fields may have multiple source locations:

class PersonSchema(Schema):
    """Schema for extracting a person's name and technical skills."""

    name: str = Field(description="Full name")
    # Use the builtin generic (PEP 585, Python 3.9+): the snippet never
    # imports typing.List, so `List[str]` would raise NameError at class
    # creation time.
    skills: list[str] = Field(description="Technical skills")
# Multi-sentence document: skill mentions are spread across sentences,
# so one field can be grounded in several distinct spans.
text = """
John Smith is a developer. He knows Python, JavaScript, and Go.
John also has experience with React and Django frameworks.
"""
result = extractor.extract(text)

# Skills might have multiple source spans
for field, spans in result.sources.items():
    print(f"\n{field}:")
    for span in spans:
        print(f" - '{span.text}' at {span.start}-{span.end}")

Each source span includes a confidence score:

# Access confidence for each source span
for field, spans in result.sources.items():
    print(f"\n{field}:")
    for span in spans:
        print(f" Text: '{span.text}'")
        print(f" Confidence: {span.confidence:.3f}")
        print(f" Position: {span.start}-{span.end}")

Validate that sources contain the expected values:

def validate_sources(result):
    """Validate that source spans contain the extracted values.

    Args:
        result: Extraction result exposing ``entities`` (field -> value)
            and ``sources`` (field -> list of span objects with ``.text``).

    Returns:
        A list of human-readable error strings; empty when every extracted
        value is backed by at least one source span.
    """
    validation_errors = []
    for field, value in result.entities.items():
        if field not in result.sources:
            validation_errors.append(f"No source found for field '{field}'")
            continue
        spans = result.sources[field]
        if not spans:
            validation_errors.append(f"Empty source spans for field '{field}'")
            continue
        # List-valued fields (e.g. skills) must be checked item by item:
        # str(["Python", "Go"]) never appears verbatim in the raw text, so
        # comparing the stringified list produced false validation errors.
        items = value if isinstance(value, (list, tuple, set)) else [value]
        span_texts = [span.text.lower() for span in spans]
        for item in items:
            needle = str(item).lower()
            # Check if any span contains the extracted value
            if not any(needle in span_text for span_text in span_texts):
                validation_errors.append(
                    f"Field '{field}' value '{item}' not found in source spans: {[s.text for s in spans]}"
                )
    return validation_errors
# Validate extraction sources
errors = validate_sources(result)
if errors:
    print("Validation errors:")
    for error in errors:
        print(f" - {error}")
else:
    print("All sources validated successfully!")

LangStruct provides advanced interactive HTML visualizations that make source grounding easy to explore:

# Generate interactive visualization
results = extractor.extract([
    "Dr. Sarah Johnson, 34, works at MIT in Cambridge, Massachusetts.",
    "Prof. John Miller, 42, teaches at Harvard University in Boston."
])

# Create advanced interactive HTML with source highlighting
extractor.visualize(results, "source_analysis.html",
                    title="Source Grounding Analysis")

print("🚀 Open source_analysis.html to explore:")
print(" • Click highlighted text spans to see extractions")
print(" • Filter by entity types")
print(" • Search across documents")
print(" • View character-level source positions")

Interactive Features:

  • Color-coded entity highlighting with consistent colors
  • Clickable source spans showing exact extraction locations
  • Real-time search across all text and entities
  • Entity type filtering with interactive chips
  • Dual view modes (document and list views)
  • Live statistics updating as you explore

For custom highlighting needs, you can also create manual visualizations:

def highlight_sources(text, sources, entities):
    """Create HTML with highlighted source spans.

    Args:
        text: The original document string.
        sources: Mapping of field name -> list of span objects exposing
            ``.start``, ``.end`` and ``.text``.
        entities: Mapping of field name -> extracted value (shown in the
            highlighted span's tooltip).

    Returns:
        An HTML string with each grounded span wrapped in a colored
        ``<span>`` carrying a "field: value" tooltip.

    NOTE: assumes spans do not overlap — overlapping spans would emit
    text out of order; confirm against the extractor's guarantees.
    """
    import html  # stdlib; escape document text before embedding in markup

    # Flatten all spans into one list annotated with field name and color.
    all_spans = []
    colors = ['yellow', 'lightblue', 'lightgreen', 'pink', 'orange']
    color_map = {}
    for field, spans in sources.items():
        if field not in color_map:
            color_map[field] = colors[len(color_map) % len(colors)]
        for span in spans:
            all_spans.append({
                'start': span.start,
                'end': span.end,
                'field': field,
                'text': span.text,
                'value': entities[field],
                'color': color_map[field]
            })
    # Sort spans by position
    all_spans.sort(key=lambda x: x['start'])
    # Build highlighted HTML. Every piece of document-derived text is
    # escaped: raw '<', '>', '&' or quotes in the source document would
    # otherwise break the markup (or inject script into the page).
    out = '<div style="font-family: monospace; line-height: 1.6;">'
    last_pos = 0
    for span in all_spans:
        # Add text before this span
        out += html.escape(text[last_pos:span['start']])
        # Add highlighted span; quote=True keeps the tooltip safe inside
        # the double-quoted title attribute.
        title = html.escape(f"{span['field']}: {span['value']}", quote=True)
        body = html.escape(span['text'])
        out += f'<span style="background-color: {span["color"]}; padding: 2px; border-radius: 3px;" title="{title}">{body}</span>'
        last_pos = span['end']
    # Add remaining text
    out += html.escape(text[last_pos:])
    out += '</div>'
    return out
# Generate highlighted HTML
highlighted = highlight_sources(text, result.sources, result.entities)

# Save to file or display in Jupyter
with open('highlighted.html', 'w') as f:
    f.write(highlighted)

Source grounding helps meet regulatory requirements:

class ComplianceExtractor:
    """Wraps a LangStruct extractor and records an audit trail per call."""

    def __init__(self, schema):
        self.extractor = LangStruct(
            schema=schema,
            use_sources=True,
            model="gemini-2.5-flash"  # Fast and reliable
        )

    def extract_with_audit_trail(self, document, document_id=None):
        """Extract with full audit trail.

        Args:
            document: Source text to extract from.
            document_id: Optional external identifier stored in the audit
                record for traceability.

        Returns:
            A ``(result, audit_record)`` tuple; ``audit_record`` is a
            JSON-serializable dict capturing what was extracted, from
            where, and with what confidence.
        """
        # Local import keeps the snippet self-contained (the original code
        # never imported datetime and would raise NameError). utcnow() is
        # deprecated since Python 3.12 and returns a naive datetime; use
        # an explicitly timezone-aware UTC timestamp instead.
        from datetime import datetime, timezone

        result = self.extractor.extract(document)
        audit_record = {
            'document_id': document_id,
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'model_version': self.extractor.model,
            'extracted_data': result.entities,
            'confidence': result.confidence,
            'source_spans': {
                field: [
                    {
                        'text': span.text,
                        'start': span.start,
                        'end': span.end,
                        'confidence': span.confidence
                    }
                    for span in spans
                ]
                for field, spans in result.sources.items()
            }
        }
        return result, audit_record
# Usage for compliance
extractor = ComplianceExtractor(PersonSchema)
result, audit = extractor.extract_with_audit_trail(
    document=text,
    document_id="DOC-2024-001"
)

# Store audit trail in database
# (store_audit_record is application-specific and defined elsewhere)
store_audit_record(audit)

Use source grounding for quality control:

class QualityChecker:
    """Scores extraction results against a minimum confidence threshold."""

    def __init__(self, min_confidence=0.8):
        self.min_confidence = min_confidence

    def check_extraction_quality(self, result):
        """Check extraction quality using source grounding.

        Returns a report dict with the overall confidence, per-field span
        statistics, warnings (fields below the threshold), and errors
        (fields with no source spans at all).
        """
        report = {
            'overall_confidence': result.confidence,
            'field_quality': {},
            'warnings': [],
            'errors': [],
        }
        for field, spans in result.sources.items():
            if not spans:
                report['errors'].append(f"No source found for field '{field}'")
                continue
            confidences = [span.confidence for span in spans]
            mean_confidence = sum(confidences) / len(confidences)
            report['field_quality'][field] = {
                'confidence': mean_confidence,
                'span_count': len(spans),
                'total_length': sum(len(span.text) for span in spans),
            }
            if mean_confidence < self.min_confidence:
                report['warnings'].append(
                    f"Low confidence for field '{field}': {mean_confidence:.3f}"
                )
        return report
# Check quality
checker = QualityChecker(min_confidence=0.85)
quality = checker.check_extraction_quality(result)
print(f"Overall confidence: {quality['overall_confidence']:.3f}")
for warning in quality['warnings']:
    print(f"Warning: {warning}")

Use source grounding to debug extraction problems:

def debug_extraction(result, expected_entities):
    """Debug extraction by comparing sources with expected values.

    Prints a per-field report comparing the extracted value (and its
    source spans) against the expected value, with improvement
    suggestions for any mismatched field.

    Args:
        result: Extraction result exposing ``entities`` and ``sources``.
        expected_entities: Mapping of field name -> expected value.
    """
    print("=== Extraction Debug Report ===")
    for field, expected_value in expected_entities.items():
        actual_value = result.entities.get(field)
        spans = result.sources.get(field, [])
        print(f"\nField: {field}")
        print(f"Expected: {expected_value}")
        print(f"Actual: {actual_value}")
        # Both branches of the original conditional were empty strings
        # (the pass/fail glyphs were lost), so "Match:" printed nothing
        # either way; restore visible markers.
        print(f"Match: {'✅' if actual_value == expected_value else '❌'}")
        if spans:
            print("Source spans:")
            for i, span in enumerate(spans):
                print(f" {i+1}. '{span.text}' (confidence: {span.confidence:.3f})")
        else:
            print("⚠️ No source spans found")
        # Suggest improvements
        if actual_value != expected_value:
            print("💡 Suggestions:")
            if not spans:
                print(" - Check if field description is clear")
                print(" - Verify the text contains the expected information")
            else:
                print(" - Review source spans for accuracy")
                print(" - Consider improving the field description")
                print(" - Add more training examples with similar patterns")
# Debug the extraction
expected = {
    'name': 'Dr. Sarah Johnson',
    'age': 34,
    'location': 'Cambridge, Massachusetts'
}
debug_extraction(result, expected)

Always Enable in Production

Source grounding is essential for production reliability and debugging

Validate Source Spans

Implement validation to ensure sources match extracted values

Store Audit Trails

Keep source information for compliance and debugging purposes

Monitor Source Quality

Track source confidence and span quality over time

Source grounding has minimal performance impact:

import time

# Benchmark with and without source grounding
def benchmark_extraction():
    """Time extraction with and without source grounding and print the
    relative overhead of enabling sources."""
    text = "Sample text for extraction..."
    iterations = 100
    # Without source grounding. perf_counter() is the documented clock
    # for interval measurement; time.time() can jump with wall-clock
    # adjustments and has coarser resolution.
    extractor_no_sources = LangStruct(schema=PersonSchema, use_sources=False)
    start = time.perf_counter()
    for _ in range(iterations):
        extractor_no_sources.extract(text)
    time_no_sources = time.perf_counter() - start
    # With source grounding
    extractor_with_sources = LangStruct(schema=PersonSchema, use_sources=True)
    start = time.perf_counter()
    for _ in range(iterations):
        extractor_with_sources.extract(text)
    time_with_sources = time.perf_counter() - start
    print(f"Without sources: {time_no_sources:.3f}s")
    print(f"With sources: {time_with_sources:.3f}s")
    print(f"Overhead: {((time_with_sources - time_no_sources) / time_no_sources * 100):.1f}%")

benchmark_extraction()