Python 文本清洗(网址、邮箱、HTML转义字符)、自动分段等
记录下这个模块,日后用得着:
# Demo of HarvestText.clean_text: each section prints a title, the raw text,
# and the cleaned text produced with the options shown in the call.
from harvesttext import HarvestText

ht = HarvestText()  # kept from the original snippet; not used in this section
print("各种清洗文本")
ht0 = HarvestText()

# The default settings can be used to clean Weibo text.
text1 = "回复@钱旭明QXM:[嘻嘻][嘻嘻] //@钱旭明QXM:杨大哥[good][good]"
print("清洗微博【@和表情符等】")
print("原:", text1)
print("清洗后:", ht0.clean_text(text1))

# Cleaning URLs.
text1 = "【#赵薇#:正筹备下一部电影 但不是青春片....http://t.cn/8FLopdQ"
print("清洗网址URL")
print("原:", text1)
print("清洗后:", ht0.clean_text(text1, remove_url=True))

# Cleaning e-mail addresses.
text1 = "我的邮箱是abc@demo.com,欢迎联系"
print("清洗邮箱")
print("原:", text1)
print("清洗后:", ht0.clean_text(text1, email=True))

# Handling URL escape (percent-encoded) characters.
text1 = "www.%E4%B8%AD%E6%96%87%20and%20space.com"
print("URL转正常字符")
print("原:", text1)
print("清洗后:", ht0.clean_text(text1, norm_url=True, remove_url=False))

text1 = "www.中文 and space.com"
print("正常字符转URL[含有中文和空格的request需要注意]")
print("原:", text1)
print("清洗后:", ht0.clean_text(text1, to_url=True, remove_url=False))

# Handling HTML escape characters.
# NOTE(review): the pasted snippet showed this input already unescaped
# ("<a c> ''"), which leaves nothing for norm_html to convert. The escaped form
# below is restored from what the upstream README appears to use -- confirm.
text1 = "&lt;a c&gt;&nbsp;&#x27;&#x27;"
print("HTML转正常字符")
print("原:", text1)
print("清洗后:", ht0.clean_text(text1, norm_html=True))

# Converting Traditional Chinese characters to Simplified.
text1 = "心碎誰買單"
print("繁体字转简体")
print("原:", text1)
print("清洗后:", ht0.clean_text(text1, t2s=True))

# github: https://github.com/blmoistawinde/HarvestText