Models Reference
Core Models
Document
The top-level container for extracted content.
unifex.base.Document
Bases: _DocumentSearchMixin, BaseModel
Source code in unifex/base/models.py
class Document(_DocumentSearchMixin, BaseModel):
    """The top-level container for extracted content.

    Holds the source path, the extracted pages (in order), and optional
    metadata about the extractor that produced them.
    """

    path: Path  # Source document path
    pages: list[Page] = Field(default_factory=list)  # Extracted pages, in order
    metadata: ExtractorMetadata | None = None  # Extractor metadata, if available

    # Pydantic v2 configuration. Replaces the deprecated v1-style inner
    # `class Config`, matching the `model_config` dict style used by the
    # other models in this package (e.g. LLMExtractionResult).
    model_config = {"arbitrary_types_allowed": True}
Page
Represents a single page with text blocks and tables.
unifex.base.Page
Bases: BaseModel
Source code in unifex/base/models.py
class Page(BaseModel):
    """A single page: its dimensions plus the text blocks and tables found on it."""

    page: int
    width: float
    height: float
    texts: list[TextBlock] = Field(default_factory=list)
    tables: list[Table] = Field(default_factory=list)
    coordinate_info: CoordinateInfo | None = None

    def search(
        self,
        pattern: str | re.Pattern[str],
        *,
        case_sensitive: bool = True,
    ) -> list[TextBlock]:
        """Return the text blocks on this page whose text matches *pattern*.

        Args:
            pattern: Plain string (matched as a literal substring) or a
                pre-compiled regular expression.
            case_sensitive: Controls case sensitivity for string patterns
                (default True); ignored when *pattern* is already compiled.

        Returns:
            Matching TextBlock objects, in the order they appear in ``texts``.
        """
        # A plain string is escaped and compiled; a compiled regex is used as-is.
        if not isinstance(pattern, re.Pattern):
            flags = 0 if case_sensitive else re.IGNORECASE
            pattern = re.compile(re.escape(pattern), flags)
        return [block for block in self.texts if pattern.search(block.text)]
search
search(
pattern: str | Pattern[str],
*,
case_sensitive: bool = True,
) -> list[TextBlock]
Search for text blocks matching a pattern.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `pattern` | `str \| Pattern[str]` | String for substring search, or compiled regex pattern. | *required* |
| `case_sensitive` | `bool` | Whether search is case-sensitive (default True). Ignored if pattern is already a compiled regex. | `True` |

Returns:

| Type | Description |
| --- | --- |
| `list[TextBlock]` | List of matching TextBlock objects. |
Source code in unifex/base/models.py
def search(
    self,
    pattern: str | re.Pattern[str],
    *,
    case_sensitive: bool = True,
) -> list[TextBlock]:
    """Return the text blocks on this page whose text matches *pattern*.

    Args:
        pattern: Plain string (matched as a literal substring) or a
            pre-compiled regular expression.
        case_sensitive: Controls case sensitivity for string patterns
            (default True); ignored when *pattern* is already compiled.

    Returns:
        Matching TextBlock objects, in the order they appear in ``texts``.
    """
    # A plain string is escaped and compiled; a compiled regex is used as-is.
    if not isinstance(pattern, re.Pattern):
        flags = 0 if case_sensitive else re.IGNORECASE
        pattern = re.compile(re.escape(pattern), flags)
    return [block for block in self.texts if pattern.search(block.text)]
TextBlock
A text element with bounding box and confidence.
unifex.base.TextBlock
Bases: BaseModel
Source code in unifex/base/models.py
class TextBlock(BaseModel):
    """A text element with bounding box and confidence."""

    text: str  # Extracted text content
    bbox: BBox  # Bounding box of this text on the page
    rotation: float = 0.0  # Rotation of the block; units/direction not specified here — TODO confirm (degrees?)
    confidence: float | None = None  # Extraction confidence; None when the backend reports none
    font_info: FontInfo | None = None  # Font details, when the backend exposes them
BBox
Bounding box coordinates.
unifex.base.BBox
Bases: BaseModel
Source code in unifex/base/models.py
class BBox(BaseModel):
    """Bounding box coordinates.

    Presumably (x0, y0) and (x1, y1) are opposite corners of the box.
    NOTE(review): the coordinate origin and y-axis direction are not
    specified here — confirm against the extractor backends.
    """

    x0: float
    y0: float
    x1: float
    y1: float
Table
Extracted table with rows and cells.
unifex.base.Table
Bases: BaseModel
A table extracted from a document page.
Source code in unifex/base/models.py
class Table(BaseModel):
    """A table extracted from a document page."""

    page: int  # Page number (0-indexed)
    cells: list[TableCell] = Field(default_factory=list)  # Sparse list of populated cells
    row_count: int = 0
    col_count: int = 0
    bbox: BBox | None = None  # Table bbox if available

    def to_dataframe(self) -> pd.DataFrame:
        """Convert table to pandas DataFrame.

        Returns:
            DataFrame with all rows as data. Columns are numbered 0, 1, 2, ...
            Missing cells are filled with empty strings.

        Raises:
            ImportError: If pandas is not installed.
        """
        # pandas is an optional dependency, so it is imported lazily here.
        try:
            import pandas as pd
        except ImportError:
            raise ImportError(
                "pandas is required for to_dataframe(). Install with: pip install unifex[tables]"
            ) from None
        # Sparse cell list -> dense row-major grid; absent (row, col) slots become "".
        text_at = {(c.row, c.col): c.text for c in self.cells}
        rows = [
            [text_at.get((r, c), "") for c in range(self.col_count)]
            for r in range(self.row_count)
        ]
        return pd.DataFrame(rows)
to_dataframe
to_dataframe() -> pd.DataFrame
Convert table to pandas DataFrame.
Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | DataFrame with all rows as data. Columns are numbered 0, 1, 2, ... Missing cells are filled with empty strings. |

Raises:

| Type | Description |
| --- | --- |
| `ImportError` | If pandas is not installed. |
Source code in unifex/base/models.py
def to_dataframe(self) -> pd.DataFrame:
    """Convert table to pandas DataFrame.

    Returns:
        DataFrame with all rows as data. Columns are numbered 0, 1, 2, ...
        Missing cells are filled with empty strings.

    Raises:
        ImportError: If pandas is not installed.
    """
    # pandas is an optional dependency, so it is imported lazily here.
    try:
        import pandas as pd
    except ImportError:
        raise ImportError(
            "pandas is required for to_dataframe(). Install with: pip install unifex[tables]"
        ) from None
    # Dense row-major grid built from the sparse cell list; gaps become "".
    cell_text = {(c.row, c.col): c.text for c in self.cells}
    rows = []
    for r in range(self.row_count):
        rows.append([cell_text.get((r, c), "") for c in range(self.col_count)])
    return pd.DataFrame(rows)
TableCell
Individual cell within a table.
unifex.base.TableCell
Bases: BaseModel
A cell within a table.
Source code in unifex/base/models.py
class TableCell(BaseModel):
    """A cell within a table."""

    text: str  # Cell text content
    row: int  # Row index within the table (0-based, per Table.to_dataframe's range() usage)
    col: int  # Column index within the table (0-based)
    bbox: BBox | None = None  # Cell bbox if available
Result Models
ExtractionResult
Result of document extraction.
Result of document extraction with all page results.
Contains the extracted document (successful pages only) and
detailed results for each requested page including any errors.
Source code in unifex/base/base.py
@dataclass
class ExtractionResult:
    """Result of document extraction with all page results.

    Contains the extracted document (successful pages only) and
    detailed results for each requested page including any errors.
    """

    document: Document
    page_results: list[PageExtractionResult] = field(default_factory=list)

    @property
    def success(self) -> bool:
        """True if all requested pages were extracted successfully."""
        return not self.failed_pages

    @property
    def failed_pages(self) -> list[PageExtractionResult]:
        """List of failed page extraction results."""
        return [r for r in self.page_results if not r.success]

    @property
    def errors(self) -> list[tuple[int, str]]:
        """List of (page_number, error_message) for failed pages."""
        # Missing error text on a failed page is reported as "".
        return [(r.page.page, r.error or "") for r in self.failed_pages]
success: bool
True if all requested pages were extracted successfully.
failed_pages: list[PageExtractionResult]
List of failed page extraction results.
errors: list[tuple[int, str]]
List of (page_number, error_message) for failed pages.
PageExtractionResult
Result of single page extraction.
Result of extracting a single page.
Source code in unifex/base/base.py
@dataclass
class PageExtractionResult:
    """Result of extracting a single page."""

    # NOTE(review): a different, generic PageExtractionResult also exists in
    # unifex/llm/models.py — confirm the shared name is intentional.
    page: Page  # The page; ExtractionResult.errors reads page.page even for failures
    success: bool  # Whether extraction of this page succeeded
    error: str | None = None  # Error message when success is False
ExtractorMetadata
Metadata about the extractor used.
Bases: BaseModel
Source code in unifex/base/models.py
class ExtractorMetadata(BaseModel):
    """Metadata about the extractor used."""

    extractor_type: ExtractorType  # Which extractor backend produced the document
    # The following optional fields look like document metadata — presumably
    # copied from the source file (e.g. PDF info); TODO confirm.
    creator: str | None = None
    producer: str | None = None
    title: str | None = None
    author: str | None = None
    creation_date: str | None = None  # Kept as a raw string; format not normalized here
    modification_date: str | None = None
    extra: dict[str, Any] = Field(default_factory=dict)  # Backend-specific extras
LLM Models
LLMExtractionResult
Result of single LLM extraction request.
Bases: BaseModel, Generic[T]
Result of LLM extraction for a single request.
Source code in unifex/llm/models.py
class LLMExtractionResult(BaseModel, Generic[T]):
    """Result of LLM extraction for a single request."""

    # Allow non-pydantic types (e.g. raw provider response objects) as field values.
    model_config = {"arbitrary_types_allowed": True}

    data: T  # The structured data extracted by the LLM
    model: str  # Model identifier used for the request
    provider: LLMProvider  # Which LLM provider served the request
    usage: dict[str, int] | None = None  # Usage counters (presumably tokens) — TODO confirm keys
    raw_response: Any | None = None  # Provider's raw response, if retained
LLMBatchExtractionResult
Result of parallel LLM extraction across multiple pages.
Bases: BaseModel, Generic[T]
Result of parallel LLM extraction across multiple pages.
Results are guaranteed to be in the same order as the input pages,
i.e., results[i] corresponds to pages[i] passed to the extraction function.
Source code in unifex/llm/models.py
class LLMBatchExtractionResult(BaseModel, Generic[T]):
    """Result of parallel LLM extraction across multiple pages.

    Results are guaranteed to be in the same order as the input pages,
    i.e., results[i] corresponds to pages[i] passed to the extraction function.
    """

    # Allow non-pydantic types as field values.
    model_config = {"arbitrary_types_allowed": True}

    results: list[PageExtractionResult[T]]  # One entry per input page, same order as input
    model: str  # Model identifier used for the requests
    provider: LLMProvider  # Which LLM provider served the requests
    total_usage: dict[str, int] | None = None  # Aggregate usage across pages — presumably summed; TODO confirm
PageExtractionResult
Result for a single page in batch LLM processing.
Bases: BaseModel, Generic[T]
Result of extraction for a single page in batch processing.
Source code in unifex/llm/models.py
class PageExtractionResult(BaseModel, Generic[T]):
    """Result of extraction for a single page in batch processing."""

    # NOTE(review): unifex/base/base.py declares a different, non-generic
    # PageExtractionResult — confirm the shared name is intentional.
    model_config = {"arbitrary_types_allowed": True}

    page: int  # Page number this result belongs to
    data: T | None = None  # Extracted data; None on failure (see `error`)
    usage: dict[str, int] | None = None  # Per-request usage counters, if reported
    error: str | None = None  # Error message when extraction of this page failed
LLMProvider
Supported LLM providers.
unifex.llm.models.LLMProvider
Bases: StrEnum
Supported LLM providers.
Source code in unifex/llm/models.py
class LLMProvider(StrEnum):
    """Supported LLM providers."""

    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    GOOGLE = "google"
    AZURE_OPENAI = "azure-openai"  # note: value uses a hyphen, not an underscore