فهرست منبع

Adapted to allow changing the spaCy model (first try).

Alexander Huwiler 1 سال پیش
والد
کامیت
46a8a6a7fd

+ 5 - 5
semantic_split/SentenceSimilarity.py

@@ -1,14 +1,14 @@
+# SentenceSimilarity.py
 from typing import List
 from sentence_transformers import SentenceTransformer, util
 
-class SentenceTransformersSimilarity():
-    def __init__(self, model='all-MiniLM-L6-v2', similarity_threshold=0.2):
+class SentenceTransformersSimilarity:
+    def __init__(self, model: str = 'all-MiniLM-L6-v2', similarity_threshold: float = 0.2):
         self.model = SentenceTransformer(model)
         self.similarity_threshold = similarity_threshold
 
-
-    def similarities(self, sentences: List[str]):
-        # Encode all sentences 
+    def similarities(self, sentences: List[str]) -> List[float]:
+        # Encode all sentences
         embeddings = self.model.encode(sentences)
 
         # Calculate cosine similarities for neighboring sentences

+ 3 - 3
semantic_split/__init__.py

@@ -1,4 +1,4 @@
-
+# __init__.py
 # Similarity models
 from .SentenceSimilarity import SentenceTransformersSimilarity
 
@@ -7,5 +7,5 @@ from .splitters.Splitter import Splitter
 from .splitters.SpacySentenceSplitter import SpacySentenceSplitter
 from .splitters.SimilarSentenceSplitter import SimilarSentenceSplitter
 
-__all__ = ['SentenceTransformersSimilarity', 'Splitter', 'SpacySentenceSplitter', 
-           'SimilarSentenceSplitter' ]
+__all__ = ['SentenceTransformersSimilarity', 'Splitter', 'SpacySentenceSplitter',
+           'SimilarSentenceSplitter']

+ 7 - 11
semantic_split/splitters/SimilarSentenceSplitter.py

@@ -1,17 +1,14 @@
-
-
+# SimilarSentenceSplitter.py
 from typing import List
 from .Splitter import Splitter
 
-
-
 class SimilarSentenceSplitter(Splitter):
-
-    def __init__(self, similarity_model, sentence_splitter: Splitter):
+    def __init__(self, similarity_model, sentence_splitter: Splitter, group_max_sentences: int = 5):
         self.model = similarity_model
         self.sentence_splitter = sentence_splitter
+        self.group_max_sentences = group_max_sentences
 
-    def split(self, text: str, group_max_sentences=5) -> List[str]:
+    def split(self, text: str) -> List[List[str]]:
         '''
             group_max_sentences: The maximum number of sentences in a group.
         '''
@@ -20,20 +17,19 @@ class SimilarSentenceSplitter(Splitter):
         if len(sentences) == 0:
             return []
 
-        
         similarities = self.model.similarities(sentences)
 
         # The first sentence is always in the first group.
         groups = [[sentences[0]]]
 
-        # Using the group min/max sentences contraints, 
+        # Using the group min/max sentences constraints,
         # group together the rest of the sentences.
         for i in range(1, len(sentences)):
-            if len(groups[-1]) >= group_max_sentences:
+            if len(groups[-1]) >= self.group_max_sentences:
                 groups.append([sentences[i]])
             elif similarities[i-1] >= self.model.similarity_threshold:
                 groups[-1].append(sentences[i])
             else:
                 groups.append([sentences[i]])
 
-        return groups
+        return groups

+ 3 - 6
semantic_split/splitters/SpacySentenceSplitter.py

@@ -1,14 +1,11 @@
-
+# SpacySentenceSplitter.py
 from typing import List
 from .Splitter import Splitter
-
 import spacy
 
-
 class SpacySentenceSplitter(Splitter):
-
-    def __init__(self):
-        self.nlp = spacy.load("en_core_web_sm")
+    def __init__(self, model: str = "en_core_web_sm"):
+        self.nlp = spacy.load(model)
 
     def split(self, text: str) -> List[str]:
         doc = self.nlp(text)