|
@@ -1,17 +1,14 @@
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
|
|
+# SimilarSentenceSplitter.py
|
|
|
from typing import List
|
|
from typing import List
|
|
|
from .Splitter import Splitter
|
|
from .Splitter import Splitter
|
|
|
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
class SimilarSentenceSplitter(Splitter):
|
|
class SimilarSentenceSplitter(Splitter):
|
|
|
-
|
|
|
|
|
- def __init__(self, similarity_model, sentence_splitter: Splitter):
|
|
|
|
|
|
|
+ def __init__(self, similarity_model, sentence_splitter: Splitter, group_max_sentences: int = 5):
|
|
|
self.model = similarity_model
|
|
self.model = similarity_model
|
|
|
self.sentence_splitter = sentence_splitter
|
|
self.sentence_splitter = sentence_splitter
|
|
|
|
|
+ self.group_max_sentences = group_max_sentences
|
|
|
|
|
|
|
|
- def split(self, text: str, group_max_sentences=5) -> List[str]:
|
|
|
|
|
|
|
+ def split(self, text: str) -> List[List[str]]:
|
|
|
'''
|
|
'''
|
|
|
group_max_sentences: The maximum number of sentences in a group.
|
|
group_max_sentences: The maximum number of sentences in a group.
|
|
|
'''
|
|
'''
|
|
@@ -20,20 +17,19 @@ class SimilarSentenceSplitter(Splitter):
|
|
|
if len(sentences) == 0:
|
|
if len(sentences) == 0:
|
|
|
return []
|
|
return []
|
|
|
|
|
|
|
|
-
|
|
|
|
|
similarities = self.model.similarities(sentences)
|
|
similarities = self.model.similarities(sentences)
|
|
|
|
|
|
|
|
# The first sentence is always in the first group.
|
|
# The first sentence is always in the first group.
|
|
|
groups = [[sentences[0]]]
|
|
groups = [[sentences[0]]]
|
|
|
|
|
|
|
|
- # Using the group min/max sentences contraints,
|
|
|
|
|
|
|
+ # Using the group min/max sentences constraints,
|
|
|
# group together the rest of the sentences.
|
|
# group together the rest of the sentences.
|
|
|
for i in range(1, len(sentences)):
|
|
for i in range(1, len(sentences)):
|
|
|
- if len(groups[-1]) >= group_max_sentences:
|
|
|
|
|
|
|
+ if len(groups[-1]) >= self.group_max_sentences:
|
|
|
groups.append([sentences[i]])
|
|
groups.append([sentences[i]])
|
|
|
elif similarities[i-1] >= self.model.similarity_threshold:
|
|
elif similarities[i-1] >= self.model.similarity_threshold:
|
|
|
groups[-1].append(sentences[i])
|
|
groups[-1].append(sentences[i])
|
|
|
else:
|
|
else:
|
|
|
groups.append([sentences[i]])
|
|
groups.append([sentences[i]])
|
|
|
|
|
|
|
|
- return groups
|
|
|
|
|
|
|
+ return groups
|