Coverage for functions\flipdare\core\tokenizer.py: 91% (96 statements)
#!/usr/bin/env python
# Copyright (c) 2026 Flipdare Pty Ltd. All rights reserved.
#
# This file is part of Flipdare's proprietary software and contains
# confidential and copyrighted material. Unauthorised copying,
# modification, distribution, or use of this file is strictly
# prohibited without prior written permission from Flipdare Pty Ltd.
#
# This software includes third-party components licensed under MIT,
# BSD, and Apache 2.0 licences. See THIRD_PARTY_NOTICES for details.
#

from dataclasses import dataclass
import spacy

from flipdare.core.singleton import Singleton
from flipdare.generated.shared.model.token_score import TokenScore

__all__ = ["Tokenizer", "TokenizerResult"]


@dataclass
class TokenizerResult:
    tokens: list[str]
    token_score: TokenScore


@dataclass
class Tokens:
    low: list[str]
    med: list[str]
    high: list[str]

    @property
    def has_tokens(self) -> bool:
        return len(self.low) > 0 or len(self.med) > 0 or len(self.high) > 0

    @property
    def low_only(self) -> bool:
        return len(self.med) == 0 and len(self.high) == 0

    @property
    def has_low(self) -> bool:
        return len(self.low) > 0

    @property
    def has_med(self) -> bool:
        return len(self.med) > 0

    @property
    def has_high(self) -> bool:
        return len(self.high) > 0

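# Tokenizer below is a Singleton wrapper around a spaCy pipeline. Its
# create_tokens() buckets extracted tokens into the low/med/high tiers of the
# Tokens dataclass above and reports the highest tier reached as the
# TokenScore of the result.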
class Tokenizer(Singleton):

    def __init__(self, nlp: spacy.language.Language | None = None) -> None:
        if nlp is None:
            nlp = spacy.load("en_core_web_sm")
        self._nlp = nlp

    @property
    def nlp(self) -> spacy.language.Language:
        return self._nlp

    def is_person_name(self, text: str) -> bool:
        doc = self._nlp(text)
        # Check if any entity found is a PERSON
        return any(ent.label_ == "PERSON" for ent in doc.ents)

    def create_tokens(self, value: str | list[str]) -> TokenizerResult:
        entries: list[str] = []
        fallback: str

        if isinstance(value, str):
            fallback = value
            entries.append(value)
        else:
            fallback = " ".join(value)
            entries.extend(value)

        tokens: list[str] = []
        max_score = TokenScore.LOW
        for entry in entries:
            tag_results = self._get_tokens(entry)
            for tag_result in tag_results:
                if tag_result.has_low:
                    tokens.extend(tag_result.low)

                if tag_result.has_med:
                    tokens.extend(tag_result.med)
                    if max_score != TokenScore.HIGH:
                        max_score = TokenScore.MEDIUM

                if tag_result.has_high:
                    tokens.extend(tag_result.high)
                    max_score = TokenScore.HIGH

        # remove duplicates while preserving order
        tokens = list(dict.fromkeys(tokens))
        if len(tokens) == 0:
            return TokenizerResult(tokens=[fallback], token_score=TokenScore.LOW)

        return TokenizerResult(tokens=tokens, token_score=max_score)

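    # _get_tokens() below places plain spaCy tokens in the "med" bucket and
    # named entities in the "high" bucket; if nothing can be extracted, the
    # raw value is kept as a single "low" token.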
    def _get_tokens(self, value: str | list[str]) -> "list[Tokens]":
        values: list[str] = []
        if isinstance(value, str):
            values.append(value)
        else:
            values.extend(value)

        result: list[Tokens] = []
        for val in values:
            tokens, named_entities = self._generate(val)

            if len(tokens) == 0 and len(named_entities) == 0:
                # LOG().warning(f'No tokens found for "{val}"')
                result.append(Tokens([val], [], []))
                continue

            # LOG().debug(f"Found {len(tokens)} tokens and "
            #             f"{len(named_entities)} named entities for value '{value}'")
            result.append(Tokens([], tokens, named_entities))

        return result

    def _generate(self, text: str) -> tuple[list[str], list[str]]:
        # Process the original text to get named entities with full context
        doc = self._nlp(text)

        # Extract named entities first (with original context)
        named_entities: list[str] = []
        entity_token_texts: set[str] = set()

        for ent in doc.ents:
            # Get the entity text without stop words/punctuation
            entity_tokens = [
                token.text for token in ent if not token.is_stop and not token.is_punct
            ]
            if entity_tokens:
                named_text = " ".join(entity_tokens)
                named_entities.append(named_text)
                # Track individual tokens that are part of entities
                entity_token_texts.update(entity_tokens)

        # Extract regular tokens (non-stop, non-punct, not part of named entities)
        # This preserves the original left-to-right order from the source text
        found_tokens: list[str] = []
        for token in doc:
            if token.is_stop or token.is_punct:
                continue
            token_text = token.text.strip()
            if token_text not in entity_token_texts:
                found_tokens.append(token_text)

        # LOG().debug(f'Named Entities: {named_entities} Tokens: {found_tokens}')
        return found_tokens, named_entities
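
# Usage sketch: a minimal example of how the Tokenizer could be exercised,
# assuming the en_core_web_sm spaCy model is installed, TokenScore exposes
# the LOW/MEDIUM/HIGH members used above, and the Singleton base allows
# direct construction. The example strings are hypothetical.
if __name__ == "__main__":
    tokenizer = Tokenizer()
    result = tokenizer.create_tokens("Alice booked a flight to Sydney")
    # Named entities such as "Alice" and "Sydney" land in the high bucket,
    # so the score is expected to be TokenScore.HIGH.
    print(result.tokens, result.token_score)
    print(tokenizer.is_person_name("Alice Smith"))  # likely True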