1 年間前 · 46a8a6a7fd
--- a/semantic_split/SentenceSimilarity.py
+++ b/semantic_split/SentenceSimilarity.py
@@ -1,14 +1,14 @@
 
															+# SentenceSimilarity.py
														
 
															 from typing import List
														
 
															 from sentence_transformers import SentenceTransformer, util
														
 
															-class SentenceTransformersSimilarity():
														
 
															-    def __init__(self, model='all-MiniLM-L6-v2', similarity_threshold=0.2):
														
 
															+class SentenceTransformersSimilarity:
														
 
															+    def __init__(self, model: str = 'all-MiniLM-L6-v2', similarity_threshold: float = 0.2):
														
 
															         self.model = SentenceTransformer(model)
														
 
															         self.similarity_threshold = similarity_threshold
														
 
															-
														
 
															-    def similarities(self, sentences: List[str]):
														
 
															-        # Encode all sentences 
														
 
															+    def similarities(self, sentences: List[str]) -> List[float]:
														
 
															+        # Encode all sentences
														
 
															         embeddings = self.model.encode(sentences)
														
 
															         # Calculate cosine similarities for neighboring sentences
														
--- a/semantic_split/__init__.py
+++ b/semantic_split/__init__.py
@@ -1,4 +1,4 @@
 
															-
														
 
															+# __init__.py
														
 
															 # Similarity models
														
 
															 from .SentenceSimilarity import SentenceTransformersSimilarity
														
@@ -7,5 +7,5 @@ from .splitters.Splitter import Splitter
 
															 from .splitters.SpacySentenceSplitter import SpacySentenceSplitter
														
 
															 from .splitters.SimilarSentenceSplitter import SimilarSentenceSplitter
														
 
															-__all__ = ['SentenceTransformersSimilarity', 'Splitter', 'SpacySentenceSplitter', 
														
 
															-           'SimilarSentenceSplitter' ]
														
 
															+__all__ = ['SentenceTransformersSimilarity', 'Splitter', 'SpacySentenceSplitter',
														
 
															+           'SimilarSentenceSplitter']
														
--- a/semantic_split/splitters/SimilarSentenceSplitter.py
+++ b/semantic_split/splitters/SimilarSentenceSplitter.py
@@ -1,17 +1,14 @@
 
															-
														
 
															-
														
 
															+# SimilarSentenceSplitter.py
														
 
															 from typing import List
														
 
															 from .Splitter import Splitter
														
 
															-
														
 
															-
														
 
															 class SimilarSentenceSplitter(Splitter):
														
 
															-
														
 
															-    def __init__(self, similarity_model, sentence_splitter: Splitter):
														
 
															+    def __init__(self, similarity_model, sentence_splitter: Splitter, group_max_sentences: int = 5):
														
 
															         self.model = similarity_model
														
 
															         self.sentence_splitter = sentence_splitter
														
 
															+        self.group_max_sentences = group_max_sentences
														
 
															-    def split(self, text: str, group_max_sentences=5) -> List[str]:
														
 
															+    def split(self, text: str) -> List[List[str]]:
														
 
															         '''
														
 
															             group_max_sentences: The maximum number of sentences in a group.
														
 
															         '''
														
@@ -20,20 +17,19 @@ class SimilarSentenceSplitter(Splitter):
 
															         if len(sentences) == 0:
														
 
															             return []
														
 
															-        
														
 
															         similarities = self.model.similarities(sentences)
														
 
															         # The first sentence is always in the first group.
														
 
															         groups = [[sentences[0]]]
														
 
															-        # Using the group min/max sentences contraints, 
														
 
															+        # Using the max-sentences-per-group limit and the similarity threshold,
														
 
															         # group together the rest of the sentences.
														
 
															         for i in range(1, len(sentences)):
														
 
															-            if len(groups[-1]) >= group_max_sentences:
														
 
															+            if len(groups[-1]) >= self.group_max_sentences:
														
 
															                 groups.append([sentences[i]])
														
 
															             elif similarities[i-1] >= self.model.similarity_threshold:
														
 
															                 groups[-1].append(sentences[i])
														
 
															             else:
														
 
															                 groups.append([sentences[i]])
														
 
															-        return groups
														
 
															+        return groups
														
--- a/semantic_split/splitters/SpacySentenceSplitter.py
+++ b/semantic_split/splitters/SpacySentenceSplitter.py
@@ -1,14 +1,11 @@
 
															-
														
 
															+# SpacySentenceSplitter.py
														
 
															 from typing import List
														
 
															 from .Splitter import Splitter
														
 
															-
														
 
															 import spacy
														
 
															-
														
 
															 class SpacySentenceSplitter(Splitter):
														
 
															-
														
 
															-    def __init__(self):
														
 
															-        self.nlp = spacy.load("en_core_web_sm")
														
 
															+    def __init__(self, model: str = "en_core_web_sm"):
														
 
															+        self.nlp = spacy.load(model)
														
 
															     def split(self, text: str) -> List[str]:
														
 
															         doc = self.nlp(text)