SimilarSentenceSplitter.py 1.2 KB

1234567891011121314151617181920212223242526272829303132333435
  1. # SimilarSentenceSplitter.py
  2. from typing import List
  3. from .Splitter import Splitter
  4. class SimilarSentenceSplitter(Splitter):
  5. def __init__(self, similarity_model, sentence_splitter: Splitter, group_max_sentences: int = 5):
  6. self.model = similarity_model
  7. self.sentence_splitter = sentence_splitter
  8. self.group_max_sentences = group_max_sentences
  9. def split(self, text: str) -> List[List[str]]:
  10. '''
  11. group_max_sentences: The maximum number of sentences in a group.
  12. '''
  13. sentences = self.sentence_splitter.split(text)
  14. if len(sentences) == 0:
  15. return []
  16. similarities = self.model.similarities(sentences)
  17. # The first sentence is always in the first group.
  18. groups = [[sentences[0]]]
  19. # Using the group min/max sentences constraints,
  20. # group together the rest of the sentences.
  21. for i in range(1, len(sentences)):
  22. if len(groups[-1]) >= self.group_max_sentences:
  23. groups.append([sentences[i]])
  24. elif similarities[i-1] >= self.model.similarity_threshold:
  25. groups[-1].append(sentences[i])
  26. else:
  27. groups.append([sentences[i]])
  28. return groups