Merge pull request #120 from u2d-ai/0.0.114 · u2d-ai/msaDocModels@cce1133 · GitHub
[go: up one dir, main page]

Skip to content

Commit cce1133

Browse files
Merge pull request #120 from u2d-ai/0.0.114
feat: added models for ExtractorUniversal
2 parents 8a5f8e2 + 766d0d5 commit cce1133

File tree

4 files changed

+157
-2
lines changed

4 files changed

+157
-2
lines changed

docs/release-notes.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
# msaDocModels Release Notes
22
## Possible future features:
33

4+
# 0.0.114
5+
6+
- add models for ExtractorUniversal
7+
48
# 0.0.113
59

610
- add ProcessingDocumentsInput model

msaDocModels/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import glob
22
from os.path import basename, dirname, isfile, join
33

4-
version = "0.0.113"
4+
version = "0.0.114"
55
__author__ = "Stefan Welcker"
66
__copyright__ = "Copyright 2022, U2D.ai"
77
__license__ = "MIT"

msaDocModels/sdu.py

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7697,3 +7697,154 @@ class UnzipInputModel(BaseModel):
76977697

76987698
archive_path: str
76997699
unpack_nested: Optional[bool] = False
7700+
7701+
7702+
class ExtractorUniversalBaseInput(BaseModel):
7703+
"""
7704+
Base input model for the universal entity extraction.
7705+
7706+
Attributes:
7707+
7708+
entity_type: A dictionary specifying entity types and their values.
7709+
max_new_tokens: The maximum number of new tokens to generate. Defaults to 256.
7710+
"""
7711+
7712+
entity_type: Dict[str, List[str]]
7713+
max_new_tokens: int = 256
7714+
7715+
7716+
class ExtractorUniversalDocumentInput(ExtractorUniversalBaseInput, BaseDocumentInput):
7717+
"""
7718+
Input model for document-level entity extraction.
7719+
7720+
Attributes:
7721+
7722+
result_output: The type of result to output (e.g., 'document', 'pages', 'paragraphs', 'sentences').
7723+
Defaults to ResultType.document.
7724+
"""
7725+
7726+
result_output: ResultType = ResultType.document
7727+
7728+
7729+
class ExtractorUniversalTextInput(ExtractorUniversalBaseInput):
7730+
"""
7731+
Input model for text-level entity extraction.
7732+
7733+
Attributes:
7734+
7735+
input_text: The input text or texts to extract entities from.
7736+
"""
7737+
7738+
input_text: Union[str, List[str], Dict[Any, str]]
7739+
7740+
7741+
class ExtractorUniversalEntity(BaseModel):
7742+
"""
7743+
Model representing an extracted entity.
7744+
7745+
Attributes:
7746+
7747+
result: The extracted entity text.
7748+
positions: The positions of the extracted entity in the input text.
7749+
"""
7750+
7751+
result: str
7752+
positions: List[Dict[str, int]]
7753+
7754+
7755+
class ExtractorUniversalDTO(BaseModel):
7756+
"""
7757+
Model that represents the result from universal entity extraction.
7758+
7759+
Attributes:
7760+
7761+
prediction:
7762+
7763+
The extracted entities organized by entity type.
7764+
"""
7765+
7766+
prediction: Union[
7767+
Dict[str, List[ExtractorUniversalEntity]],
7768+
List[Dict[str, List[ExtractorUniversalEntity]]],
7769+
Dict[str, Dict[str, List[ExtractorUniversalEntity]]],
7770+
]
7771+
7772+
7773+
class ExtractorUniversalPageResult(NestingId):
7774+
"""
7775+
Model representing the result of entity extraction for a page.
7776+
7777+
Attributes:
7778+
7779+
result: The extracted entities organized by entity type.
7780+
"""
7781+
7782+
result: Dict[str, List[ExtractorUniversalEntity]]
7783+
7784+
7785+
class ExtractorUniversalSentenceResult(NestingId):
7786+
"""
7787+
Model representing the result of entity extraction for a sentence.
7788+
7789+
Attributes:
7790+
7791+
result: The extracted entities organized by entity type.
7792+
"""
7793+
7794+
result: Dict[str, List[ExtractorUniversalEntity]]
7795+
7796+
7797+
class ExtractorUniversalParagraphSentences(NestingId):
7798+
"""
7799+
Model representing the result of entity extraction for paragraphs with sentences.
7800+
7801+
Attributes:
7802+
7803+
sentences: List of sentence-level extraction results.
7804+
"""
7805+
7806+
sentences: List[ExtractorUniversalSentenceResult]
7807+
7808+
7809+
class ExtractorUniversalParagraphResult(NestingId):
7810+
"""
7811+
Model representing the result of entity extraction for a paragraph.
7812+
7813+
Attributes:
7814+
7815+
result: The extracted entities organized by entity type.
7816+
"""
7817+
7818+
result: Dict[str, List[ExtractorUniversalEntity]]
7819+
7820+
7821+
class ExtractorUniversalPageParagraphs(NestingId):
7822+
"""
7823+
Model representing the result of entity extraction for pages with paragraphs.
7824+
7825+
Attributes:
7826+
7827+
paragraphs: List of paragraph-level extraction results.
7828+
"""
7829+
7830+
paragraphs: Union[
7831+
List[ExtractorUniversalParagraphResult],
7832+
List[ExtractorUniversalParagraphSentences],
7833+
]
7834+
7835+
7836+
class ExtractorUniversalDocumentDTO(BaseModel):
7837+
"""
7838+
Model representing the result of universal entity extraction for a document.
7839+
7840+
Attributes:
7841+
7842+
extractor_universal: The extracted entities organized by entity type at various levels
7843+
(document, page, paragraph, sentence).
7844+
"""
7845+
7846+
extractor_universal: Union[
7847+
Dict[str, List[ExtractorUniversalEntity]],
7848+
List[ExtractorUniversalPageResult],
7849+
List[ExtractorUniversalPageParagraphs],
7850+
]

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "msaDocModels"
3-
version = "0.0.113"
3+
version = "0.0.114"
44
description = "MSA Document Pydantic Models and Schemas, used to store Parser, NLP, NLU and AI results for processed documents"
55
authors = ["Stefan Welcker"]
66
readme = "README.md"

0 commit comments

Comments
 (0)
0