[笔记] 利用 Python 的 re 模块进行正则匹配
用 re 模块编写正则表达式来匹配文本
代码中的 test.txt 是利用 requests 请求下面这个地址得到的 HTML 文本：
https://www.maoyan.com/board/4?offset=0
import json
import re
import requests
def test_re():
    """Fetch ``https://pz.wendu.com/`` and print the matched link texts.

    The response body is searched for ``<a target=...>`` tags whose opening
    tag contains ``Chatpre``; the text between each such opening tag and the
    next ``</a>`` is collected and the resulting list is printed.

    No regex flags are passed, so ``.`` does not match newlines and every
    match has to fit on a single line of the response.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    url = 'https://pz.wendu.com/'
    # Without an explicit timeout requests.get() may block indefinitely on an
    # unresponsive server.
    response = requests.get(url, timeout=10)
    data = response.text
    res = re.findall(r'<a target=.*?Chatpre.*?>(.*?)</a>', data)
    print(res)
def test_re2():
    """Yield one dict per ``<dd>...</dd>`` entry found in ``test.txt``.

    ``test.txt`` is read as UTF-8 from the current working directory and
    scanned with a single regular expression compiled with ``re.S`` (so ``.``
    also matches newlines and one entry may span several lines).

    Yields:
        dict: with the keys ``index``, ``data-src``, ``title``, ``star``,
        ``time`` and ``score``.  ``score`` is the concatenation of the two
        captured score fragments.

    Raises:
        OSError: if ``test.txt`` cannot be opened (raised when iteration
            starts, because this is a generator).
    """
    # The with-statement closes the file; no explicit close() is needed.
    with open('test.txt', 'r', encoding='utf-8') as f:
        content = f.read()

    # One adjacent raw string per captured field; the concatenation is the
    # same pattern as a single long literal.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(.*?)</i>'
        r'.*?data-src="(.*?)"'
        r'.*?name"><a.*?>(.*?)</a>'
        r'.*?"star">(.*?)</p>'
        r'.*?"releasetime">(.*?)</p>'
        r'.*?score"><i.*?>(.*?)</i>'
        r'.*?fraction">(.*?)</i>'
        r'.*?</dd>',
        re.S)

    for index, image, title, star, release, integer, fraction in pattern.findall(content):
        yield {
            'index': index,
            'data-src': image,
            'title': title,
            # Drops the first 3 characters after trimming whitespace --
            # presumably a fixed label in front of the cast list; confirm
            # against the page markup.
            'star': star.strip()[3:],
            # Drops the first 5 characters after trimming whitespace --
            # presumably a fixed label in front of the date; confirm
            # against the page markup.
            'time': release.strip()[5:],
            'score': integer + fraction,
        }
if __name__ == '__main__':
    # Script entry point: print every parsed record and append it to
    # result.txt as one JSON object per line.
    #
    # All records are collected first so that a failure while reading or
    # parsing the input happens before result.txt is touched.
    records = list(test_re2())
    if records:
        # Open the output once instead of once per record; the
        # with-statement closes it, so no explicit close() is needed.
        with open('result.txt', 'a', encoding='utf-8') as out:
            for record in records:
                print(record)
                # ensure_ascii=False keeps non-ASCII text readable in the file.
                out.write(json.dumps(record, ensure_ascii=False) + '\n')