Models Reference
Core Models
Document
The top-level container for extracted content.
unifex.base.Document
Bases: _DocumentSearchMixin, BaseModel
Source code in unifex/base/models.py
class Document(_DocumentSearchMixin, BaseModel):
    """The top-level container for extracted content.

    Holds the source path, the extracted pages (in order), and optional
    metadata about the extractor that produced them.
    """

    path: Path  # Source document path
    pages: list[Page] = Field(default_factory=list)  # Extracted pages, in order
    metadata: ExtractorMetadata | None = None  # Extractor metadata, if available

    # Pydantic v2 configuration. Replaces the deprecated v1-style inner
    # `class Config`, matching the `model_config` dict style used by the
    # other models in this package (e.g. LLMExtractionResult).
    model_config = {"arbitrary_types_allowed": True}
Page
Represents a single page with text blocks and tables.
unifex.base.Page
Bases: BaseModel
Source code in unifex/base/models.py
class Page(BaseModel):
    """A single page: its dimensions plus the text blocks and tables found on it."""

    page: int
    width: float
    height: float
    texts: list[TextBlock] = Field(default_factory=list)
    tables: list[Table] = Field(default_factory=list)
    coordinate_info: CoordinateInfo | None = None

    def search(
        self,
        pattern: str | re.Pattern[str],
        *,
        case_sensitive: bool = True,
    ) -> list[TextBlock]:
        """Return the text blocks on this page whose text matches *pattern*.

        Args:
            pattern: Plain string (matched as a literal substring) or a
                pre-compiled regular expression.
            case_sensitive: Controls case sensitivity for string patterns
                (default True); ignored when *pattern* is already compiled.

        Returns:
            Matching TextBlock objects, in the order they appear in ``texts``.
        """
        # A plain string is escaped and compiled; a compiled regex is used as-is.
        if not isinstance(pattern, re.Pattern):
            flags = 0 if case_sensitive else re.IGNORECASE
            pattern = re.compile(re.escape(pattern), flags)
        return [block for block in self.texts if pattern.search(block.text)]
search
search(
pattern: str | Pattern[str],
*,
case_sensitive: bool = True,
) -> list[TextBlock]
Search for text blocks matching a pattern.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `pattern` | `str \| Pattern[str]` | String for substring search, or compiled regex pattern. | *required* |
| `case_sensitive` | `bool` | Whether search is case-sensitive (default True). Ignored if pattern is already a compiled regex. | `True` |

Returns:

| Type | Description |
| --- | --- |
| `list[TextBlock]` | List of matching TextBlock objects. |
Source code in unifex/base/models.py
def search(
    self,
    pattern: str | re.Pattern[str],
    *,
    case_sensitive: bool = True,
) -> list[TextBlock]:
    """Return the text blocks on this page whose text matches *pattern*.

    Args:
        pattern: Plain string (matched as a literal substring) or a
            pre-compiled regular expression.
        case_sensitive: Controls case sensitivity for string patterns
            (default True); ignored when *pattern* is already compiled.

    Returns:
        Matching TextBlock objects, in the order they appear in ``texts``.
    """
    # A plain string is escaped and compiled; a compiled regex is used as-is.
    if not isinstance(pattern, re.Pattern):
        flags = 0 if case_sensitive else re.IGNORECASE
        pattern = re.compile(re.escape(pattern), flags)
    return [block for block in self.texts if pattern.search(block.text)]
TextBlock
A text element with bounding box and confidence.
unifex.base.TextBlock
Bases: BaseModel
Source code in unifex/base/models.py
class TextBlock(BaseModel):
    """A text element with bounding box and confidence."""

    text: str  # Extracted text content
    bbox: BBox  # Bounding box of this text on the page
    rotation: float = 0.0  # Rotation of the block; units/direction not specified here — TODO confirm (degrees?)
    confidence: float | None = None  # Extraction confidence; None when the backend reports none
    font_info: FontInfo | None = None  # Font details, when the backend exposes them
BBox
Bounding box coordinates.
unifex.base.BBox
Bases: BaseModel
Source code in unifex/base/models.py
class BBox(BaseModel):
    """Bounding box coordinates.

    Presumably (x0, y0) and (x1, y1) are opposite corners of the box.
    NOTE(review): the coordinate origin and y-axis direction are not
    specified here — confirm against the extractor backends.
    """

    x0: float
    y0: float
    x1: float
    y1: float
Table
Extracted table with rows and cells.
unifex.base.Table
Bases: BaseModel
A table extracted from a document page.
Source code in unifex/base/models.py
class Table(BaseModel):
    """A table extracted from a document page."""

    page: int  # Page number (0-indexed)
    cells: list[TableCell] = Field(default_factory=list)  # Sparse list of populated cells
    row_count: int = 0
    col_count: int = 0
    bbox: BBox | None = None  # Table bbox if available

    def to_dataframe(self) -> pd.DataFrame:
        """Convert table to pandas DataFrame.

        Returns:
            DataFrame with all rows as data. Columns are numbered 0, 1, 2, ...
            Missing cells are filled with empty strings.

        Raises:
            ImportError: If pandas is not installed.
        """
        # pandas is an optional dependency, so it is imported lazily here.
        try:
            import pandas as pd
        except ImportError:
            raise ImportError(
                "pandas is required for to_dataframe(). Install with: pip install unifex[tables]"
            ) from None
        # Sparse cell list -> dense row-major grid; absent (row, col) slots become "".
        text_at = {(c.row, c.col): c.text for c in self.cells}
        rows = [
            [text_at.get((r, c), "") for c in range(self.col_count)]
            for r in range(self.row_count)
        ]
        return pd.DataFrame(rows)
to_dataframe
to_dataframe() -> pd.DataFrame
Convert table to pandas DataFrame.
Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | DataFrame with all rows as data. Columns are numbered 0, 1, 2, ... Missing cells are filled with empty strings. |

Raises:

| Type | Description |
| --- | --- |
| `ImportError` | If pandas is not installed. |
Source code in unifex/base/models.py
def to_dataframe(self) -> pd.DataFrame:
    """Convert table to pandas DataFrame.

    Returns:
        DataFrame with all rows as data. Columns are numbered 0, 1, 2, ...
        Missing cells are filled with empty strings.

    Raises:
        ImportError: If pandas is not installed.
    """
    # pandas is an optional dependency, so it is imported lazily here.
    try:
        import pandas as pd
    except ImportError:
        raise ImportError(
            "pandas is required for to_dataframe(). Install with: pip install unifex[tables]"
        ) from None
    # Dense row-major grid built from the sparse cell list; gaps become "".
    cell_text = {(c.row, c.col): c.text for c in self.cells}
    rows = []
    for r in range(self.row_count):
        rows.append([cell_text.get((r, c), "") for c in range(self.col_count)])
    return pd.DataFrame(rows)
TableCell
Individual cell within a table.
unifex.base.TableCell
Bases: BaseModel
A cell within a table.
Source code in unifex/base/models.py
class TableCell(BaseModel):
    """A cell within a table."""

    text: str  # Cell text content
    row: int  # Row index within the table (0-based, per Table.to_dataframe's range() usage)
    col: int  # Column index within the table (0-based)
    bbox: BBox | None = None  # Cell bbox if available
Result Models
ExtractionResult
Result of document extraction.
Result of document extraction with all page results.
Contains the extracted document (successful pages only) and
detailed results for each requested page including any errors.
Source code in unifex/base/base.py
@dataclass
class ExtractionResult:
    """Result of document extraction with all page results.

    Contains the extracted document (successful pages only) and
    detailed results for each requested page including any errors.
    """

    document: Document
    page_results: list[PageExtractionResult] = field(default_factory=list)

    @property
    def success(self) -> bool:
        """True if all requested pages were extracted successfully."""
        return not self.failed_pages

    @property
    def failed_pages(self) -> list[PageExtractionResult]:
        """List of failed page extraction results."""
        return [r for r in self.page_results if not r.success]

    @property
    def errors(self) -> list[tuple[int, str]]:
        """List of (page_number, error_message) for failed pages."""
        # Missing error text on a failed page is reported as "".
        return [(r.page.page, r.error or "") for r in self.failed_pages]
success: bool
True if all requested pages were extracted successfully.
failed_pages: list[PageExtractionResult]
List of failed page extraction results.
errors: list[tuple[int, str]]
List of (page_number, error_message) for failed pages.
PageExtractionResult
Result of single page extraction.
Result of extracting a single page.
Source code in unifex/base/base.py
@dataclass
class PageExtractionResult:
    """Result of extracting a single page."""

    # NOTE(review): a different, generic PageExtractionResult also exists in
    # unifex/llm/models.py — confirm the shared name is intentional.
    page: Page  # The page; ExtractionResult.errors reads page.page even for failures
    success: bool  # Whether extraction of this page succeeded
    error: str | None = None  # Error message when success is False
ExtractorMetadata
Metadata about the extractor used.
Bases: BaseModel
Source code in unifex/base/models.py
class ExtractorMetadata(BaseModel):
    """Metadata about the extractor used."""

    extractor_type: ExtractorType  # Which extractor backend produced the document
    # The following optional fields look like document metadata — presumably
    # copied from the source file (e.g. PDF info); TODO confirm.
    creator: str | None = None
    producer: str | None = None
    title: str | None = None
    author: str | None = None
    creation_date: str | None = None  # Kept as a raw string; format not normalized here
    modification_date: str | None = None
    extra: dict[str, Any] = Field(default_factory=dict)  # Backend-specific extras
LLM Models
LLMExtractionResult
Result of single LLM extraction request.
Bases: BaseModel, Generic[T]
Result of LLM extraction for a single request.
Source code in unifex/llm/models.py
class LLMExtractionResult(BaseModel, Generic[T]):
    """Result of LLM extraction for a single request."""

    # Allow non-pydantic types (e.g. raw provider response objects) as field values.
    model_config = {"arbitrary_types_allowed": True}

    data: T  # The structured data extracted by the LLM
    model: str  # Model identifier used for the request
    provider: LLMProvider  # Which LLM provider served the request
    usage: dict[str, int] | None = None  # Usage counters (presumably tokens) — TODO confirm keys
    raw_response: Any | None = None  # Provider's raw response, if retained
LLMBatchExtractionResult
Result of parallel LLM extraction across multiple pages.
Bases: BaseModel, Generic[T]
Result of parallel LLM extraction across multiple pages.
Results are guaranteed to be in the same order as the input pages,
i.e., results[i] corresponds to pages[i] passed to the extraction function.
Source code in unifex/llm/models.py
class LLMBatchExtractionResult(BaseModel, Generic[T]):
    """Result of parallel LLM extraction across multiple pages.

    Results are guaranteed to be in the same order as the input pages,
    i.e., results[i] corresponds to pages[i] passed to the extraction function.
    """

    # Allow non-pydantic types as field values.
    model_config = {"arbitrary_types_allowed": True}

    results: list[PageExtractionResult[T]]  # One entry per input page, same order as input
    model: str  # Model identifier used for the requests
    provider: LLMProvider  # Which LLM provider served the requests
    total_usage: dict[str, int] | None = None  # Aggregate usage across pages — presumably summed; TODO confirm
PageExtractionResult
Result for a single page in batch LLM processing.
Bases: BaseModel, Generic[T]
Result of extraction for a single page in batch processing.
Source code in unifex/llm/models.py
class PageExtractionResult(BaseModel, Generic[T]):
    """Result of extraction for a single page in batch processing."""

    # NOTE(review): unifex/base/base.py declares a different, non-generic
    # PageExtractionResult — confirm the shared name is intentional.
    model_config = {"arbitrary_types_allowed": True}

    page: int  # Page number this result belongs to
    data: T | None = None  # Extracted data; None on failure (see `error`)
    usage: dict[str, int] | None = None  # Per-request usage counters, if reported
    error: str | None = None  # Error message when extraction of this page failed
LLMProvider
Supported LLM providers.
unifex.llm.models.LLMProvider
Bases: StrEnum
Supported LLM providers.
Source code in unifex/llm/models.py
class LLMProvider(StrEnum):
    """Supported LLM providers."""

    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    GOOGLE = "google"
    AZURE_OPENAI = "azure-openai"  # note: value uses a hyphen, not an underscore