chinese_text_splitter.py 848 B

12345678910111213141516171819202122232425
  1. from langchain.text_splitter import CharacterTextSplitter
  2. import re
  3. from typing import List
  4. class ChineseTextSplitter(CharacterTextSplitter):
  5. def __init__(self, pdf: bool = False, **kwargs):
  6. super().__init__(**kwargs)
  7. self.pdf = pdf
  8. def split_text(self, text: str) -> List[str]:
  9. if self.pdf:
  10. text = re.sub(r"\n{3,}", "\n", text)
  11. text = re.sub('\s', ' ', text)
  12. text = text.replace("\n\n", "")
  13. sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))') # del :;
  14. sent_list = []
  15. for ele in sent_sep_pattern.split(text):
  16. if sent_sep_pattern.match(ele) and sent_list:
  17. sent_list[-1] += ele
  18. elif ele:
  19. sent_list.append(ele)
  20. return sent_list