| 1234567891011121314151617181920212223242526272829303132333435 |
- # SimilarSentenceSplitter.py
- from typing import List
- from .Splitter import Splitter
class SimilarSentenceSplitter(Splitter):
    '''
    Splits a text into groups of consecutive sentences.

    The text is first cut into sentences by ``sentence_splitter``; adjacent
    sentences are then kept in the same group while the similarity model
    scores them at or above ``similarity_model.similarity_threshold`` and the
    group has fewer than ``group_max_sentences`` sentences.

    similarity_model: Object exposing ``similarities(sentences)`` and a
        ``similarity_threshold`` attribute.
    sentence_splitter: Splitter used to break the text into sentences.
    group_max_sentences: The maximum number of sentences in a group.
    '''

    def __init__(self, similarity_model, sentence_splitter: Splitter, group_max_sentences: int = 5):
        self.model = similarity_model
        self.sentence_splitter = sentence_splitter
        self.group_max_sentences = group_max_sentences

    def split(self, text: str) -> List[List[str]]:
        '''
        Return the sentences of ``text`` as a list of groups, each group being
        a list of consecutive sentences. Returns an empty list when the
        sentence splitter yields no sentences.
        '''
        sentences = self.sentence_splitter.split(text)
        if len(sentences) == 0:
            return []

        # NOTE(review): similarities[k] is presumably the score between
        # sentences[k] and sentences[k + 1] -- confirm against the model.
        similarities = self.model.similarities(sentences)

        # The first sentence always opens the first group.
        current_group = [sentences[0]]
        groups = [current_group]

        for position in range(1, len(sentences)):
            sentence = sentences[position]
            has_room = len(current_group) < self.group_max_sentences
            # The similarity score is only consulted when the group still has
            # room; a full group always forces a new one.
            if has_room and similarities[position - 1] >= self.model.similarity_threshold:
                current_group.append(sentence)
                continue
            current_group = [sentence]
            groups.append(current_group)

        return groups
|