# alexander/semantic_split_MultiLang
# SimilarSentenceSplitter.py
from typing import List
from .Splitter import Splitter

class SimilarSentenceSplitter(Splitter):
    """Group the consecutive sentences of a text using similarity scores.

    The text is first cut into sentences by ``sentence_splitter``. Each
    following sentence then either joins the most recent group or starts a
    new one, depending on the similarity model's score and on the size limit
    ``group_max_sentences``.
    """

    def __init__(self, similarity_model, sentence_splitter: Splitter, group_max_sentences: int = 5):
        """Store the collaborators and the group size limit.

        Args:
            similarity_model: Object used by ``split``; it must provide a
                ``similarities(sentences)`` method and a
                ``similarity_threshold`` attribute.
            sentence_splitter: Splitter whose ``split(text)`` produces the
                sentences to be grouped.
            group_max_sentences: The maximum number of sentences in a group.
        """
        self.model = similarity_model
        self.sentence_splitter = sentence_splitter
        self.group_max_sentences = group_max_sentences

    def split(self, text: str) -> List[List[str]]:
        """Split ``text`` into ordered groups of sentences.

        Args:
            text: The text to split.

        Returns:
            A list of groups, each a list of sentences kept in their original
            order. An empty list is returned when the sentence splitter
            produces no sentences.
        """
        sentences = self.sentence_splitter.split(text)
        sentence_count = len(sentences)
        if sentence_count == 0:
            return []

        # NOTE(review): similarities[k] is used to decide whether sentence
        # k + 1 joins the group holding sentence k -- presumably it is the
        # score between those two neighbours; confirm against the model.
        similarities = self.model.similarities(sentences)

        # The opening sentence always seeds the first group.
        current_group = [sentences[0]]
        groups = [current_group]

        for position, sentence in enumerate(sentences[1:]):
            group_is_full = len(current_group) >= self.group_max_sentences
            # The score is only consulted while the current group has room.
            if not group_is_full and similarities[position] >= self.model.similarity_threshold:
                current_group.append(sentence)
            else:
                # Either the size limit was reached or the score fell below
                # the threshold: this sentence opens a new group.
                current_group = [sentence]
                groups.append(current_group)

        return groups