Skip to content

Models Reference

Core Models

Document

The top-level container for extracted content.

unifex.base.Document

Bases: _DocumentSearchMixin, BaseModel

Source code in unifex/base/models.py
class Document(_DocumentSearchMixin, BaseModel):
    # Top-level container for extracted content. Also inherits from
    # _DocumentSearchMixin, which is defined outside this listing.

    # Pydantic v2 style configuration, declared the same way as the LLM result
    # models; the nested ``class Config`` form is deprecated in Pydantic v2.
    model_config = {"arbitrary_types_allowed": True}

    # Path associated with the document (presumably the source file — confirm).
    path: Path
    # Pages held by the document; each instance gets its own empty list by default.
    pages: list[Page] = Field(default_factory=list)
    # Extractor metadata, or None when none is attached.
    metadata: ExtractorMetadata | None = None

Page

Represents a single page with text blocks and tables.

unifex.base.Page

Bases: BaseModel

Source code in unifex/base/models.py
class Page(BaseModel):
    # A single page: its number, dimensions, text blocks and tables.
    page: int
    width: float
    height: float
    texts: list[TextBlock] = Field(default_factory=list)
    tables: list[Table] = Field(default_factory=list)
    coordinate_info: CoordinateInfo | None = None

    def search(
        self,
        pattern: str | re.Pattern[str],
        *,
        case_sensitive: bool = True,
    ) -> list[TextBlock]:
        """Find the text blocks whose text contains a match for ``pattern``.

        Args:
            pattern: A plain string, matched literally as a substring, or an
                already-compiled regular expression, used as-is.
            case_sensitive: When False, a plain-string pattern is matched
                ignoring case. Has no effect on a compiled regex, which
                keeps its own flags.

        Returns:
            The matching TextBlock objects, in the order they appear in
            ``self.texts``.
        """
        if not isinstance(pattern, re.Pattern):
            # Plain strings are escaped so regex metacharacters match literally.
            literal = re.escape(pattern)
            if case_sensitive:
                pattern = re.compile(literal)
            else:
                pattern = re.compile(literal, re.IGNORECASE)

        found_in = pattern.search
        return [block for block in self.texts if found_in(block.text)]

search

search(
    pattern: str | Pattern[str],
    *,
    case_sensitive: bool = True,
) -> list[TextBlock]

Search for text blocks matching a pattern.

Parameters:

Name Type Description Default
pattern str | Pattern[str]

String for substring search, or compiled regex pattern.

required
case_sensitive bool

Whether search is case-sensitive (default True). Ignored if pattern is already a compiled regex.

True

Returns:

Type Description
list[TextBlock]

List of matching TextBlock objects.

Source code in unifex/base/models.py
def search(
    self,
    pattern: str | re.Pattern[str],
    *,
    case_sensitive: bool = True,
) -> list[TextBlock]:
    """Find the text blocks whose text contains a match for ``pattern``.

    Args:
        pattern: A plain string, matched literally as a substring, or an
            already-compiled regular expression, used as-is.
        case_sensitive: When False, a plain-string pattern is matched
            ignoring case. Has no effect on a compiled regex, which
            keeps its own flags.

    Returns:
        The matching TextBlock objects, in the order they appear in
        ``self.texts``.
    """
    if not isinstance(pattern, re.Pattern):
        # Plain strings are escaped so regex metacharacters match literally.
        literal = re.escape(pattern)
        if case_sensitive:
            pattern = re.compile(literal)
        else:
            pattern = re.compile(literal, re.IGNORECASE)

    found_in = pattern.search
    return [block for block in self.texts if found_in(block.text)]

TextBlock

A text element with bounding box and confidence.

unifex.base.TextBlock

Bases: BaseModel

Source code in unifex/base/models.py
class TextBlock(BaseModel):
    # A text element with a bounding box and optional confidence/font details.

    # The text content; this is the string Page.search matches against.
    text: str
    # Bounding box of the block.
    bbox: BBox
    # Rotation of the block, 0.0 by default.
    # NOTE(review): units and direction are not shown here — confirm.
    rotation: float = 0.0
    # Confidence value, or None when not provided.
    # NOTE(review): value range is not shown here — confirm.
    confidence: float | None = None
    # Font details, or None when not provided.
    font_info: FontInfo | None = None

BBox

Bounding box coordinates.

unifex.base.BBox

Bases: BaseModel

Source code in unifex/base/models.py
class BBox(BaseModel):
    # Bounding box described by four float coordinates.
    # NOTE(review): presumably (x0, y0) and (x1, y1) are opposite corners; the
    # origin, axis direction and units are not shown here — confirm.
    x0: float
    y0: float
    x1: float
    y1: float

Table

Extracted table with rows and cells.

unifex.base.Table

Bases: BaseModel

A table extracted from a document page.

Source code in unifex/base/models.py
class Table(BaseModel):
    """A table extracted from a document page."""

    page: int  # Page number (0-indexed)
    cells: list[TableCell] = Field(default_factory=list)
    row_count: int = 0
    col_count: int = 0
    bbox: BBox | None = None  # Table bbox if available

    def to_dataframe(self) -> pd.DataFrame:
        """Build a pandas DataFrame holding the text of every table cell.

        The frame has ``row_count`` rows and ``col_count`` columns. No row is
        treated as a header, so columns keep pandas' default 0, 1, 2, ...
        labels. Grid positions with no cell become empty strings.

        Returns:
            The table contents as a DataFrame of strings.

        Raises:
            ImportError: If pandas is not installed.
        """
        try:
            import pandas as pd
        except ImportError:
            message = "pandas is required for to_dataframe(). Install with: pip install unifex[tables]"
            raise ImportError(message) from None

        # Sparse lookup keyed by (row, col); a later cell at the same
        # position replaces an earlier one.
        text_at: dict[tuple[int, int], str] = {}
        for cell in self.cells:
            text_at[cell.row, cell.col] = cell.text

        # Expand the sparse lookup into a dense row-major grid.
        columns = range(self.col_count)
        rows = [
            [text_at.get((r, c), "") for c in columns]
            for r in range(self.row_count)
        ]
        return pd.DataFrame(rows)

to_dataframe

to_dataframe() -> pd.DataFrame

Convert table to pandas DataFrame.

Returns:

Type Description
DataFrame

DataFrame with all rows as data. Columns are numbered 0, 1, 2, ... Missing cells are filled with empty strings.

Raises:

Type Description
ImportError

If pandas is not installed.

Source code in unifex/base/models.py
def to_dataframe(self) -> pd.DataFrame:
    """Build a pandas DataFrame holding the text of every table cell.

    The frame has ``row_count`` rows and ``col_count`` columns. No row is
    treated as a header, so columns keep pandas' default 0, 1, 2, ...
    labels. Grid positions with no cell become empty strings.

    Returns:
        The table contents as a DataFrame of strings.

    Raises:
        ImportError: If pandas is not installed.
    """
    try:
        import pandas as pd
    except ImportError:
        message = "pandas is required for to_dataframe(). Install with: pip install unifex[tables]"
        raise ImportError(message) from None

    # Sparse lookup keyed by (row, col); a later cell at the same
    # position replaces an earlier one.
    text_at: dict[tuple[int, int], str] = {}
    for cell in self.cells:
        text_at[cell.row, cell.col] = cell.text

    # Expand the sparse lookup into a dense row-major grid.
    columns = range(self.col_count)
    rows = [
        [text_at.get((r, c), "") for c in columns]
        for r in range(self.row_count)
    ]
    return pd.DataFrame(rows)

TableCell

Individual cell within a table.

unifex.base.TableCell

Bases: BaseModel

A cell within a table.

Source code in unifex/base/models.py
class TableCell(BaseModel):
    """A cell within a table."""

    # Text content of the cell.
    text: str
    # Row and column position of the cell in the table grid. Table.to_dataframe
    # looks cells up by (row, col) over range(row_count) x range(col_count),
    # i.e. positions are addressed starting from 0.
    row: int
    col: int
    bbox: BBox | None = None  # Cell bbox if available

Result Models

ExtractionResult

Result of document extraction.

unifex.base.ExtractionResult dataclass

Result of document extraction with all page results.

Contains the extracted document (successful pages only) and detailed results for each requested page including any errors.

Source code in unifex/base/base.py
@dataclass
class ExtractionResult:
    """Outcome of extracting a document, page by page.

    Holds the extracted document (successful pages only) together with a
    detailed result for each requested page, including any errors.
    """

    document: Document
    page_results: list[PageExtractionResult] = field(default_factory=list)

    @property
    def success(self) -> bool:
        """Whether every requested page was extracted successfully."""
        return not any(not outcome.success for outcome in self.page_results)

    @property
    def failed_pages(self) -> list[PageExtractionResult]:
        """The page results whose extraction did not succeed."""
        return list(filter(lambda outcome: not outcome.success, self.page_results))

    @property
    def errors(self) -> list[tuple[int, str]]:
        """(page_number, error_message) pairs for the failed pages.

        A failed page without an error message is reported with an empty
        string.
        """
        return [
            (failure.page.page, failure.error or "")
            for failure in self.failed_pages
        ]

success property

success: bool

True if all requested pages were extracted successfully.

failed_pages property

failed_pages: list[PageExtractionResult]

List of failed page extraction results.

errors property

errors: list[tuple[int, str]]

List of (page_number, error_message) for failed pages.

PageExtractionResult

Result of a single page extraction.

unifex.base.PageExtractionResult dataclass

Result of extracting a single page.

Source code in unifex/base/base.py
@dataclass
class PageExtractionResult:
    """Result of extracting a single page."""

    # The page this result refers to; ExtractionResult.errors reads its
    # ``page`` attribute as the page number.
    page: Page
    # Whether extraction of this page succeeded.
    success: bool
    # Error message, if any; None by default. ExtractionResult.errors reports
    # a missing message as an empty string.
    error: str | None = None

Metadata

ExtractorMetadata

Metadata about the extractor used.

unifex.base.ExtractorMetadata

Bases: BaseModel

Source code in unifex/base/models.py
class ExtractorMetadata(BaseModel):
    # Metadata about the extractor used. Only extractor_type is required;
    # every other field defaults to None (or an empty dict for ``extra``).

    # Which extractor this metadata describes.
    extractor_type: ExtractorType
    # Optional descriptive fields, stored as plain strings.
    # NOTE(review): presumably taken from the source document's own metadata;
    # the date string format is not shown here — confirm.
    creator: str | None = None
    producer: str | None = None
    title: str | None = None
    author: str | None = None
    creation_date: str | None = None
    modification_date: str | None = None
    # Free-form additional entries; each instance gets its own empty dict.
    extra: dict[str, Any] = Field(default_factory=dict)

LLM Models

LLMExtractionResult

Result of a single LLM extraction request.

unifex.llm.models.LLMExtractionResult

Bases: BaseModel, Generic[T]

Result of LLM extraction for a single request.

Source code in unifex/llm/models.py
class LLMExtractionResult(BaseModel, Generic[T]):
    """Result of LLM extraction for a single request."""

    # Allow field types Pydantic has no built-in validation for.
    model_config = {"arbitrary_types_allowed": True}

    # Extracted payload, typed by the generic parameter T.
    data: T
    # Model name, as a string.
    model: str
    # Provider, as an LLMProvider member.
    provider: LLMProvider
    # Integer usage counters keyed by name, or None when not provided.
    # NOTE(review): presumably token counts; key names are not shown here.
    usage: dict[str, int] | None = None
    # Untyped raw response object, or None when not kept.
    raw_response: Any | None = None

LLMBatchExtractionResult

Result of parallel LLM extraction across multiple pages.

unifex.llm.models.LLMBatchExtractionResult

Bases: BaseModel, Generic[T]

Result of parallel LLM extraction across multiple pages.

Results are guaranteed to be in the same order as the input pages, i.e., results[i] corresponds to pages[i] passed to the extraction function.

Source code in unifex/llm/models.py
class LLMBatchExtractionResult(BaseModel, Generic[T]):
    """Result of parallel LLM extraction across multiple pages.

    Results are guaranteed to be in the same order as the input pages,
    i.e., results[i] corresponds to pages[i] passed to the extraction function.
    """

    # Allow field types Pydantic has no built-in validation for.
    model_config = {"arbitrary_types_allowed": True}

    # One per-page result per input page, in input order (see docstring).
    results: list[PageExtractionResult[T]]
    # Model name, as a string.
    model: str
    # Provider, as an LLMProvider member.
    provider: LLMProvider
    # Integer usage counters keyed by name, or None when not provided.
    # NOTE(review): presumably the sum of the per-page ``usage`` values — confirm.
    total_usage: dict[str, int] | None = None

PageExtractionResult (LLM)

Result for a single page in batch LLM processing.

unifex.llm.models.PageExtractionResult

Bases: BaseModel, Generic[T]

Result of extraction for a single page in batch processing.

Source code in unifex/llm/models.py
class PageExtractionResult(BaseModel, Generic[T]):
    """Result of extraction for a single page in batch processing."""

    # Allow field types Pydantic has no built-in validation for.
    model_config = {"arbitrary_types_allowed": True}

    # Page number this result belongs to.
    # NOTE(review): indexing base is not shown here — confirm.
    page: int
    # Extracted payload typed by T, or None (the default).
    # NOTE(review): presumably None when extraction of the page failed.
    data: T | None = None
    # Integer usage counters keyed by name, or None when not provided.
    usage: dict[str, int] | None = None
    # Error message, or None (the default).
    error: str | None = None

LLMProvider

Supported LLM providers.

unifex.llm.models.LLMProvider

Bases: StrEnum

Supported LLM providers.

Source code in unifex/llm/models.py
class LLMProvider(StrEnum):
    """Supported LLM providers."""

    # StrEnum members are themselves strings, so each member compares equal to
    # its lowercase identifier below.
    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    GOOGLE = "google"
    AZURE_OPENAI = "azure-openai"  # Hyphenated value, unlike the member name.