Take Douban Top 250 as an example: https://movie.douban.com/top250 . The list contains 250 movies, 25 per page, and each page's URL has the form https://movie.douban.com/top250?start=
The start parameter is the index of the first movie on the page; at 25 movies per page, the values are 0, 25, 50, ..., 225.
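As a quick sanity check, the ten page URLs can be generated by stepping start in increments of 25. A minimal sketch (the scraper below builds these same URLs inside getData):

```python
base = "https://movie.douban.com/top250?start="

# start = 0, 25, 50, ..., 225 — one URL per page of 25 movies
for start in range(0, 250, 25):
    print(base + str(start))
```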
Importing the libraries

The following libraries are needed:

```python
from bs4 import BeautifulSoup
import urllib.request
import xlwt
import re
import sqlite3   # not used below yet; kept for an optional database-storage variant
```
Requesting the page

```python
def askUrl(url):
    # Spoof a browser User-Agent so Douban does not reject the request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4544.0 Safari/537.36 Edg/93.0.933.1",
    }
    request = urllib.request.Request(url, headers=headers)
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except Exception as err:
        print("Request failed:", err)
        return ""   # return an empty string on failure so callers always get a str
    return html
```
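A quick way to try askUrl on its own is to fetch the first page and eyeball the result; this sketch assumes the request is not blocked and simply checks that some HTML came back:

```python
html = askUrl("https://movie.douban.com/top250?start=0")
print(len(html))                        # a non-zero length means the page was fetched
print('<div class="item">' in html)     # the movie entries the scraper looks for
```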
Extracting the data

Regular expressions for each field:

```python
findLink = re.compile(r'<a href="(.*?)">')
findImg = re.compile(r'<img.*src="(.*?)"', re.S)
findTitle = re.compile(r'<span class="title">(.*?)</span>')
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
findCNumber = re.compile(r'<span>(\d*)人评价</span>')
findDesc = re.compile(r'<span class="inq">(.*?)</span>', re.S)
findInfo = re.compile(r'<p class="">(.*?)</p>', re.S)
```
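To see what the patterns capture, they can be exercised against a small hand-written fragment shaped like Douban's markup (the fragment below is an illustration, not real page content):

```python
sample = ('<span class="title">某电影</span>'
          '<span class="rating_num" property="v:average">9.0</span>'
          '<span>1000人评价</span>')
print(re.findall(findTitle, sample))     # ['某电影']
print(re.findall(findRating, sample))    # ['9.0']
print(re.findall(findCNumber, sample))   # ['1000']
```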
```python
def getData(baseurl):
    datalist = []
    for i in range(10):                       # 10 pages of 25 movies each
        url = baseurl + str(i * 25)
        html = askUrl(url)
        soup = BeautifulSoup(html, "lxml")
        for item in soup.find_all('div', class_="item"):
            item = str(item)
            data = []
            link = re.findall(findLink, item)[0]
            img = re.findall(findImg, item)[0]
            title = re.findall(findTitle, item)
            rating = re.findall(findRating, item)[0]
            cnumber = re.findall(findCNumber, item)[0]
            desc = re.findall(findDesc, item)
            info = re.findall(findInfo, item)[0]
            info = re.sub(r'<br(\s+)?/>(\s+)?', "", info)   # strip <br/> tags
            info = re.sub('/', "", info)
            data.append(link)
            data.append(img)
            if len(title) == 2:               # both a Chinese and a foreign title
                data.append(title[0])
                data.append(title[1])
            else:
                data.append(title[0])
                data.append('')               # no foreign title
            data.append(rating)
            data.append(cnumber)
            if len(desc) == 0:
                data.append('')               # some movies have no tagline
            else:
                data.append(desc[0].replace("。", ""))
            data.append(info.strip())
            datalist.append(data)
    for items in datalist:                    # debug output: one record per line
        print(items)
    return datalist
```
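Once the function runs without errors, a quick sanity check confirms that all ten pages were parsed (this assumes every request succeeded):

```python
datalist = getData("https://movie.douban.com/top250?start=")
assert len(datalist) == 250, len(datalist)   # 10 pages × 25 movies
print(datalist[0])   # [link, image URL, Chinese title, foreign title, rating, votes, tagline, info]
```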
Storing the data

```python
def saveData(datalist, path):
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)
    # Column headers: link, image URL, Chinese title, foreign title,
    # rating, vote count, tagline, info
    col = ('链接', '图片链接', '中文名', '外文名', '评分', '评价人数', '描述', '信息')
    for i in range(8):
        sheet.write(0, i, col[i])             # header row
    for i in range(250):
        print("row %d" % (i + 1))             # progress output
        data = datalist[i]
        for j in range(8):
            sheet.write(i + 1, j, data[j])    # row 0 holds the headers
    book.save('豆瓣Top250.xls')
```
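The import list also pulls in sqlite3, which the code above never uses. As a sketch, the same eight-field rows could go into a SQLite database instead of an .xls file; the function, table, and file names here are illustrative, not part of the original script:

```python
def saveData2DB(datalist, dbpath):
    # Hypothetical SQLite variant of saveData (names are made up for illustration)
    conn = sqlite3.connect(dbpath)
    conn.execute('''
        CREATE TABLE IF NOT EXISTS top250 (
            link TEXT, img TEXT, ctitle TEXT, etitle TEXT,
            rating TEXT, cnumber TEXT, description TEXT, info TEXT
        )
    ''')
    conn.executemany('INSERT INTO top250 VALUES (?, ?, ?, ?, ?, ?, ?, ?)', datalist)
    conn.commit()
    conn.close()

# e.g. saveData2DB(datalist, 'douban_top250.db')
```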
The main routine

```python
def main():
    baseurl = "https://movie.douban.com/top250?start="
    datalist = getData(baseurl)   # scrape the data
    path = ".\\"                  # save path (not used yet; the file name is hard-coded in saveData)
    saveData(datalist, path)      # store the data
```
At this point the initial scrape is done, and the data has been saved to 豆瓣Top250.xls.
```python
if __name__ == "__main__":
    main()
    print("Done scraping")
```