Getting Started

LLM Access Needed

LangStruct works with any LLM provider — use free tier services, paid APIs, or even run models locally. Choose what works best for you:

Google Gemini

  • Free tier with generous limits
  • Fast and reliable
  • Get free key →
  • export GOOGLE_API_KEY="your-key"

OpenAI

  • Latest models (GPT-4o, o1)
  • Paid service
  • Get key →
  • export OPENAI_API_KEY="sk-your-key"

Anthropic

  • Claude models
  • Paid service
  • Get key →
  • export ANTHROPIC_API_KEY="sk-ant-key"

Local (Ollama)

  • Free and private, runs entirely on your machine
  • No API key required

Installation

pip install "langstruct[examples]"
Quick Example

from langstruct import LangStruct

# Define schema by example
extractor = LangStruct(example={
    "company": "Apple Inc.",
    "revenue": 125.3,
    "quarter": "Q3 2024"
})

# Extract from text
text = "Apple reported $125.3B revenue in Q3 2024..."
result = extractor.extract(text)

print(result.entities)
# {'company': 'Apple Inc.', 'revenue': 125.3, 'quarter': 'Q3 2024'}

print(result.sources)  # Character-level source tracking
# {'company': [CharSpan(0, 5, 'Apple')], ...}

# Boost accuracy with refinement
refined_result = extractor.extract(text, refine=True)
print(f"Confidence: {refined_result.confidence:.1%}")  # Higher confidence
Query Parsing

from langstruct import LangStruct

# Same instance for both extraction and parsing
ls = LangStruct(example={
    "company": "Apple Inc.",
    "revenue": 125.3,
    "quarter": "Q3 2024"
})

# Parse a natural-language query
query = "Q3 2024 tech companies over $100B discussing AI"
result = ls.query(query)

print(result.semantic_terms)
# ['tech companies', 'AI', 'artificial intelligence']

print(result.structured_filters)
# {'quarter': 'Q3 2024', 'revenue': {'$gte': 100.0}}
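To make the filter semantics concrete, here is a small illustrative sketch that applies those Mongo-style operators to plain dicts (only $gte is handled; in practice the vector store evaluates these for you):

def matches(row, filters):
    # Mongo-style filter semantics, illustrated on plain dicts
    for field, cond in filters.items():
        if isinstance(cond, dict):  # operator form, e.g. {"$gte": 100.0}
            if "$gte" in cond and not row.get(field, float("-inf")) >= cond["$gte"]:
                return False
        elif row.get(field) != cond:  # exact-match form
            return False
    return True

rows = [
    {"company": "Apple", "quarter": "Q3 2024", "revenue": 125.3},
    {"company": "Smallco", "quarter": "Q3 2024", "revenue": 2.0},
]
filters = {"quarter": "Q3 2024", "revenue": {"$gte": 100.0}}
print([r["company"] for r in rows if matches(r, filters)])  # ['Apple']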
RAG Integration

from uuid import uuid4

from langstruct import LangStruct
from chromadb import Client

# 1. Single instance for both operations
ls = LangStruct(example={
    "company": "Apple",
    "revenue": 100.0,
    "quarter": "Q3"
})

vector_db = Client().create_collection("docs")

# 2. Index documents with extracted metadata
def index_document(text):
    metadata = ls.extract(text).entities
    vector_db.add(ids=[str(uuid4())], documents=[text], metadatas=[metadata])

# 3. Query with natural language
def search(query):
    parsed = ls.query(query)
    return vector_db.query(
        query_texts=parsed.semantic_terms,
        where=parsed.structured_filters,
        n_results=5
    )

# Usage
index_document("Apple reported $125B in Q3 2024...")
results = search("Q3 tech companies over $100B")
# Returns only Apple, not other Q3 mentions
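One caveat: Chroma metadata values must be scalars (str, int, float, or bool), so if your example schema includes list fields, flatten them before indexing. A minimal sketch (the helper name is ours, not part of LangStruct):

def to_scalar_metadata(entities):
    # Join list values into comma-separated strings; pass scalars through
    return {k: ", ".join(map(str, v)) if isinstance(v, list) else v
            for k, v in entities.items()}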
Schema from Multiple Examples

# Better type inference from multiple examples
examples = [
    {"name": "Alice", "age": 25, "skills": ["Python"]},
    {"name": "Bob", "age": 35, "skills": ["JavaScript", "React"]}
]
extractor = LangStruct(examples=examples)
# Infers: name=str, age=int, skills=List[str]
Pydantic Schemas

from pydantic import BaseModel, Field
from typing import List, Optional

class CompanySchema(BaseModel):
    name: str
    revenue: float = Field(gt=0, description="Revenue in billions")
    employees: Optional[int] = None
    products: List[str] = []

extractor = LangStruct(schema=CompanySchema)
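Extraction then returns values validated against the schema's types and constraints. A quick usage sketch (the sample text and output are ours, for illustration):

result = extractor.extract("Acme Corp posted $5.2B revenue with 12,000 employees.")
print(result.entities)
# e.g. {'name': 'Acme Corp', 'revenue': 5.2, 'employees': 12000, 'products': []}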
Model Selection

# Default: auto-detects available models
extractor = LangStruct(example=schema)

# Specific model
extractor = LangStruct(
    example=schema,
    model="gpt-5-mini"  # example OpenAI model name
)

# Local with Ollama
extractor = LangStruct(
    example=schema,
    model="ollama/llama3.2"
)
Batch Processing

# Process multiple documents efficiently
documents = [doc1, doc2, doc3, ...]
results = extractor.extract(documents, max_workers=8, show_progress=True)

for result in results:
    print(f"Confidence: {result.confidence:.1%}")
    print(f"Entities: {result.entities}")

# Batch processing with refinement for higher accuracy
results = extractor.extract(
    documents,
    refine=True,
    max_workers=5,
    rate_limit=60,     # calls per minute
    retry_failed=True  # raise on failures (False to skip with warnings)
)
# Note: 2-5x higher cost but significantly better accuracy
Source Grounding

result = extractor.extract(text)

# See exactly where each value came from
for field, spans in result.sources.items():
    for span in spans:
        print(f"{field}: '{text[span.start:span.end]}' at {span.start}-{span.end}")
Saving and Exporting

# Save an extractor (preserves all state)
extractor.save("./my_extractor")

# Load anywhere (API keys must be available)
loaded_extractor = LangStruct.load("./my_extractor")

# Works exactly like the original
result = loaded_extractor.extract("New text")

# Save an individual result
result.save_json("output.json")

# Export batch results
extractor.export_batch(results, "output.csv")  # csv/json/excel/parquet

# JSONL round-trip
extractor.save_annotated_documents(results, "extractions.jsonl")
loaded = extractor.load_annotated_documents("extractions.jsonl")
extractor.visualize(loaded, "results.html")
Tips

  • Choose models that fit your speed/cost/quality needs
  • Batch documents with max_workers; respect quotas with rate_limit (see the sketch below)
  • Use refinement for accuracy-sensitive workloads (higher cost)
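Putting those tips together, a configuration sketch that uses only parameters shown earlier on this page (the model choice is just an example):

extractor = LangStruct(
    example={"company": "Apple Inc.", "revenue": 125.3, "quarter": "Q3 2024"},
    model="ollama/llama3.2",  # or any provider model that fits your needs
)
results = extractor.extract(
    documents,
    refine=True,    # higher accuracy, higher cost
    max_workers=8,  # parallel workers for throughput
    rate_limit=60,  # stay within provider quotas (calls per minute)
)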