# test_similar_sentences_splitter.py (1.8 KB)

  1. import pytest
  2. from utls import load_testdata
  3. from semantic_split import SentenceTransformersSimilarity, \
  4. SimilarSentenceSplitter, SpacySentenceSplitter
  # Shared splitter instance: assigned by the fixture below, read by the tests.
  splitter = None


  # Loading spaCy and the SentenceTransformer takes time, so we do it once for
  # all tests in this module instead of once per test.
  @pytest.fixture(scope="module", autouse=True)
  def run_before_and_after_tests():
      """Build the shared ``SimilarSentenceSplitter`` before the tests run.

      The fixture is module-scoped and autouse, so the similarity model and
      the sentence splitter are constructed a single time and stored in the
      module-level ``splitter`` global that every test reads.

      It deliberately requests no other fixtures: a module-scoped fixture
      cannot depend on a function-scoped one such as ``tmpdir`` (pytest raises
      a ScopeMismatch error), and the temporary directory was never used.
      """
      global splitter
      model = SentenceTransformersSimilarity()
      sentence_splitter = SpacySentenceSplitter()
      splitter = SimilarSentenceSplitter(
          similarity_model=model,
          sentence_splitter=sentence_splitter,
      )
      yield  # this is where the testing happens
  def test_two_similar_sentences():
      """Two closely related sentences are expected to form a single group."""
      expected_groups = [["I love dogs.", "I love cats."]]
      groups = splitter.split("I love dogs. I love cats.")
      assert groups == expected_groups
  def test_similar_sentences():
      """Four sentences are expected to be split into two groups of two."""
      first_group = [
          "I dogs are amazing.",
          "Cats must be the easiest pets around.",
      ]
      second_group = [
          "Robots are advanced now with AI.",
          "Flying in space can only be done by Artificial intelligence.",
      ]
      # The input holds one sentence per line, in the same order as the
      # expected groups.
      text = "\n".join(first_group + second_group)
      groups = splitter.split(text)
      assert groups == [first_group, second_group]
  def test_different_sentences():
      """Two unrelated sentences are expected to end up in separate groups."""
      text = "I love dogs. He has flowers at home."
      res = splitter.split(text)
      # One whole-structure comparison covers both groups. On failure pytest's
      # assertion introspection shows the differing elements, so neither a
      # debug print nor per-element asserts are needed.
      assert res == [["I love dogs."], ["He has flowers at home."]]
  def test_5th_sentences():
      """The text loaded from ``sentences.txt`` is expected to give five groups."""
      groups = splitter.split(load_testdata('sentences.txt'))
      group_count = len(groups)
      assert group_count == 5
  def test_max_group_sentences():
      """With ``group_max_sentences=1``, ``sentences.txt`` is expected to give 20 groups."""
      sample = load_testdata('sentences.txt')
      groups = splitter.split(sample, group_max_sentences=1)
      # NOTE(review): presumably 20 is the number of sentences in the data
      # file, since each group is capped at one sentence -- confirm against it.
      expected_group_count = 20
      assert len(groups) == expected_group_count