Coverage for functions\flipdare\core\tokenizer.py: 91% (96 statements)
#!/usr/bin/env python
# Copyright (c) 2026 Flipdare Pty Ltd. All rights reserved.
#
# This file is part of Flipdare's proprietary software and contains
# confidential and copyrighted material. Unauthorised copying,
# modification, distribution, or use of this file is strictly
# prohibited without prior written permission from Flipdare Pty Ltd.
#
# This software includes third-party components licensed under MIT,
# BSD, and Apache 2.0 licences. See THIRD_PARTY_NOTICES for details.
#

from dataclasses import dataclass
import spacy

from flipdare.core.singleton import Singleton
from flipdare.generated.shared.model.token_score import TokenScore

__all__ = ["Tokenizer", "TokenizerResult"]


@dataclass
class TokenizerResult:
    tokens: list[str]
    token_score: TokenScore


@dataclass
class Tokens:
    low: list[str]
    med: list[str]
    high: list[str]

    @property
    def has_tokens(self) -> bool:
        return len(self.low) > 0 or len(self.med) > 0 or len(self.high) > 0

    @property
    def low_only(self) -> bool:
        return len(self.med) == 0 and len(self.high) == 0

    @property
    def has_low(self) -> bool:
        return len(self.low) > 0

    @property
    def has_med(self) -> bool:
        return len(self.med) > 0

    @property
    def has_high(self) -> bool:
        return len(self.high) > 0

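# Tokenizer below is a Singleton wrapper around a spaCy pipeline. Its
# create_tokens() buckets extracted tokens into the low/med/high tiers of the
# Tokens dataclass above and reports the highest tier reached as the
# TokenScore of the result.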
class Tokenizer(Singleton):

    def __init__(self, nlp: spacy.language.Language | None = None) -> None:
        if nlp is None:
            nlp = spacy.load("en_core_web_sm")
        self._nlp = nlp

    @property
    def nlp(self) -> spacy.language.Language:
        return self._nlp

    def is_person_name(self, text: str) -> bool:
        doc = self._nlp(text)
        # Check if any entity found is a PERSON
        return any(ent.label_ == "PERSON" for ent in doc.ents)

    def create_tokens(self, value: str | list[str]) -> TokenizerResult:
        entries: list[str] = []
        fallback: str

        if isinstance(value, str):
            fallback = value
            entries.append(value)
        else:
            fallback = " ".join(value)
            entries.extend(value)

        tokens: list[str] = []
        max_score = TokenScore.LOW
        for entry in entries:
            tag_results = self._get_tokens(entry)
            for tag_result in tag_results:
                if tag_result.has_low:
                    tokens.extend(tag_result.low)

                if tag_result.has_med:
                    tokens.extend(tag_result.med)
                    if max_score != TokenScore.HIGH:
                        max_score = TokenScore.MEDIUM

                if tag_result.has_high:
                    tokens.extend(tag_result.high)
                    max_score = TokenScore.HIGH

        # remove duplicates while preserving order
        tokens = list(dict.fromkeys(tokens))
        if len(tokens) == 0:
            return TokenizerResult(tokens=[fallback], token_score=TokenScore.LOW)

        return TokenizerResult(tokens=tokens, token_score=max_score)

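    # _get_tokens() below places plain spaCy tokens in the "med" bucket and
    # named entities in the "high" bucket; if nothing can be extracted, the
    # raw value is kept as a single "low" token.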
    def _get_tokens(self, value: str | list[str]) -> "list[Tokens]":
        values: list[str] = []
        if isinstance(value, str):
            values.append(value)
        else:
            values.extend(value)

        result: list[Tokens] = []
        for val in values:
            tokens, named_entities = self._generate(val)

            if len(tokens) == 0 and len(named_entities) == 0:
                # LOG().warning(f'No tokens found for "{val}"')
                result.append(Tokens([val], [], []))
                continue

            # LOG().debug(f"Found {len(tokens)} tokens and "
            #             f"{len(named_entities)} named entities for value '{value}'")
            result.append(Tokens([], tokens, named_entities))

        return result

    def _generate(self, text: str) -> tuple[list[str], list[str]]:
        # Process the original text to get named entities with full context
        doc = self._nlp(text)

        # Extract named entities first (with original context)
        named_entities: list[str] = []
        entity_token_texts: set[str] = set()

        for ent in doc.ents:
            # Get the entity text without stop words/punctuation
            entity_tokens = [
                token.text for token in ent if not token.is_stop and not token.is_punct
            ]
            if entity_tokens:
                named_text = " ".join(entity_tokens)
                named_entities.append(named_text)
                # Track individual tokens that are part of entities
                entity_token_texts.update(entity_tokens)

        # Extract regular tokens (non-stop, non-punct, not part of named entities)
        # This preserves the original left-to-right order from the source text
        found_tokens: list[str] = []
        for token in doc:
            if token.is_stop or token.is_punct:
                continue
            token_text = token.text.strip()
            if token_text not in entity_token_texts:
                found_tokens.append(token_text)

        # LOG().debug(f'Named Entities: {named_entities} Tokens: {found_tokens}')
        return found_tokens, named_entities
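
# Usage sketch: a minimal example of how the Tokenizer could be exercised,
# assuming the en_core_web_sm spaCy model is installed, TokenScore exposes
# the LOW/MEDIUM/HIGH members used above, and the Singleton base allows
# direct construction. The example strings are hypothetical.
if __name__ == "__main__":
    tokenizer = Tokenizer()
    result = tokenizer.create_tokens("Alice booked a flight to Sydney")
    # Named entities such as "Alice" and "Sydney" land in the high bucket,
    # so the score is expected to be TokenScore.HIGH.
    print(result.tokens, result.token_score)
    print(tokenizer.is_person_name("Alice Smith"))  # likely True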