from bs4 import BeautifulSoup
import re
import urllib.error,urllib.request
import xlwt
import sqlite3
def main():
    """Scrape the movie list and store the resulting rows in a SQLite file.

    The rows returned by ``getData`` for the hard-coded listing URL are
    handed to ``saveData2db`` together with the database file name.
    """
    list_url = "https://movie.douban.com/top250?start="
    db_file = "movie250.db"
    saveData2db(getData(list_url), db_file)
# Pre-compiled patterns that getData applies to the HTML text of one entry.
# Each pattern has a single capture group holding the value of interest.

# Target of an <a href="..."> tag.
findLink = re.compile(r'<a href="(.*?)">')

# Text of a <span class="title"> element.
findName = re.compile(r'<span class="title">(.*)</span>')

# src attribute of an <img> tag; DOTALL lets the tag span several lines.
findIMGSrc = re.compile(r'<img.*src="(.*?)"', flags=re.S)

# Text of the rating_num span.
findRating = re.compile(
    r'<span class="rating_num" property="v:average">(.*)</span>'
)

# Digits that precede the "人评价" suffix inside a bare <span>.
findNum = re.compile(r'<span>(\d*)人评价</span>')

# Text of a <span class="inq"> element.
findFeel = re.compile(r'<span class="inq">(.*?)</span>')

# Contents of a <p class=""> element; DOTALL because it spans several lines.
findBD = re.compile(r'<p class="">(.*?)</p>', flags=re.S)
def getData(baseurl):
    """Download the listing pages and return one row of strings per entry.

    Ten pages are requested; the URL of page ``i`` (0..9) is *baseurl*
    followed by ``str(i * 25)``. Every ``<div class="item">`` found in a
    page is converted into a row by ``_parse_item``.

    Args:
        baseurl: URL prefix to which the start offset is appended.

    Returns:
        A list of rows. Each row is a list of eight strings:
        [link, image URL, first title, second title, rating, vote count,
        quote, details].

    Raises:
        IndexError: if an item lacks one of the mandatory fields
            (see ``_parse_item``).
    """
    dataList = []
    for page in range(10):
        html = askURL(baseurl + str(page * 25))
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            # The patterns work on text, so serialise the tag first.
            dataList.append(_parse_item(str(item)))
    return dataList


def _parse_item(item):
    """Extract the eight row fields from the HTML text of a single entry.

    Args:
        item: HTML text of one ``<div class="item">`` element.

    Returns:
        [link, image URL, first title, second title, rating, vote count,
        quote, details], all strings. A single space stands in for a
        missing second title or a missing quote.

    Raises:
        IndexError: if the link, image, title, rating, vote-count or
            details pattern finds no match in *item*.
    """
    data = [
        re.findall(findLink, item)[0],
        re.findall(findIMGSrc, item)[0],
    ]

    titles = re.findall(findName, item)
    data.append(titles[0])
    # The second title is only recorded when exactly two titles were found;
    # the "/" characters in it are dropped.
    data.append(titles[1].replace("/", "") if len(titles) == 2 else " ")

    data.append(re.findall(findRating, item)[0])
    data.append(re.findall(findNum, item)[0])

    quotes = re.findall(findFeel, item)
    data.append(quotes[0].replace("。", "") if quotes else " ")

    details = re.findall(findBD, item)[0]
    # Replace line-break tags first ("\s" so that "<br />" is matched too),
    # then the remaining "/" separators.
    details = re.sub(r"<br(\s+)?/>", " ", details)
    details = details.replace("/", " ")
    data.append(details.strip())
    return data
def saveData2db(dataList, dbpath):
    """Insert every row of *dataList* into the ``movie250`` table of *dbpath*.

    ``init_db`` is called first to set up the table. Values are bound through
    ``?`` placeholders instead of being pasted into the SQL text, so quotes
    or other special characters inside a value cannot break or alter the
    statement. The rows in *dataList* are not modified.

    All rows are written in a single transaction: they are committed together
    on success and rolled back if any insert fails. The connection is closed
    in either case.

    Args:
        dataList: iterable of rows, each a sequence of eight values in the
            order (info_link, pic_link, cname, oname, score, people, feel,
            near).
        dbpath: path of the SQLite database file.

    Raises:
        sqlite3.Error: if the table cannot be set up or a row cannot be
            inserted (for example, a row that does not have eight values).
    """
    init_db(dbpath)
    sql = (
        "insert into movie250 "
        "(info_link, pic_link, cname, oname, score, people, feel, near) "
        "values (?, ?, ?, ?, ?, ?, ?, ?)"
    )
    conn = sqlite3.connect(dbpath)
    try:
        # NOTE(review): score and people are bound as text; with the numeric
        # column types declared for this table SQLite is expected to store
        # well-formed numeric text as numbers - confirm against the schema.
        with conn:  # commit on success, roll back on error
            conn.executemany(sql, dataList)
    finally:
        conn.close()
def init_db(dbpath):
    """Create the ``movie250`` table in the SQLite database at *dbpath*.

    The statement uses ``if not exists``, so calling this on a database that
    already contains the table leaves it untouched instead of raising. The
    connection is closed even when the statement fails.

    Args:
        dbpath: path of the SQLite database file (created by SQLite if it
            does not exist yet).

    Raises:
        sqlite3.Error: if the database cannot be opened or the statement
            cannot be executed.
    """
    sql = '''
        create table if not exists movie250
        (
        id integer primary key autoincrement,
        info_link text,
        pic_link text,
        cname varchar ,
        oname varchar ,
        score numeric ,/*显示小数*/
        people number ,
        feel text,
        near text
        );
    '''
    conn = sqlite3.connect(dbpath)
    try:
        with conn:  # commit on success, roll back on error
            conn.execute(sql)
    finally:
        conn.close()
def askURL(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request is sent with a browser-like ``User-Agent`` header. Failures
    while opening, reading or decoding the response are reported on stdout
    and result in an empty string rather than an exception.

    Args:
        url: address to request.

    Returns:
        The decoded body, or ``""`` if the request failed.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/32"
    }
    request = urllib.request.Request(url, headers=header)
    html = ""
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except Exception as e:
        # Deliberately broad: a failed page yields "" instead of aborting
        # the caller.
        reported = False
        if hasattr(e, "code"):
            print(e.code)
            reported = True
        if hasattr(e, "reason"):
            print(e.reason)
            reported = True
        if not reported:
            # Do not swallow errors that carry neither attribute silently.
            print(e)
    return html
if __name__ == "__main__":
    # Script entry point: run the scrape, then print a completion message.
    main()
    print("爬取结束")