# lxml library: extract the paragraph text of an article from a GBK-encoded
# HTML file.
#
# Two names must be defined before this snippet runs (neither is defined in
# the visible source):
#   html  - the HTML file to parse (rebound below to the parsed tree)
#   front - the replacement substituted for every markup tag
#
# Result: `answer` holds one "\n"-prefixed chunk per matched <p> element.
import re

from lxml import etree

# Noise removed from each serialized paragraph before the tags are stripped:
# spaces, non-breaking spaces, newlines, carriage returns, tabs and semicolons.
# NOTE(review): the original pattern was split across two lines, apparently
# because HTML entities inside it were decoded during extraction. The
# `&#160;` / `&#13;` alternatives and `\xa0` are a reconstruction of those
# entities (the character references the serializer emits for NBSP and CR)
# -- confirm against the author's original pattern.
_NOISE_RE = re.compile(r'&#160;|&#13;|[ \xa0\n\r\t;]')
# Non-greedy match of a single markup tag.
_TAG_RE = re.compile(r'<.*?>')

parser = etree.HTMLParser(encoding="gbk")
# The input is an HTML file; parse it with the GBK-aware parser.
html = etree.parse(html, parser=parser)
answers = html.xpath('//*/div[@class="l-tbody"]/div[@class="art-text"]/p')

pieces = []
for paragraph in answers:
    # Serialize the <p> element back to markup and decode it to str.
    # NOTE(review): per the lxml docs, tostring() prepends an XML declaration
    # for byte encodings other than ASCII/UTF-8; it is replaced by `front`
    # below like any other tag -- confirm this is intended.
    markup = etree.tostring(paragraph, encoding='gbk').decode('gbk')
    # Whitespace is removed first, so no tag spans a newline when stripped.
    markup = _NOISE_RE.sub("", markup)
    pieces.append("\n" + _TAG_RE.sub(front, markup))

# Join once instead of growing the string on every iteration.
answer = "".join(pieces)