Take Douban's Top 250 movies as an example: https://movie.douban.com/top250 . The page lists 250 movies, 25 per page, and each page is addressed as https://movie.douban.com/top250?start= followed by an offset. At 25 entries per page, the valid offsets are 0, 25, 50, ..., 225.
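As a minimal sketch, the ten page URLs can be generated in a loop (the scraper below does the same arithmetic inside getData):

```python
base = "https://movie.douban.com/top250?start="
for page in range(10):
    print(base + str(page * 25))   # start=0, 25, 50, ..., 225
```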
Importing the libraries

The following libraries are used:
```python
from bs4 import BeautifulSoup   # HTML parsing
import urllib.request           # sending HTTP requests
import xlwt                     # writing .xls files
import re                       # regular expressions
import sqlite3                  # SQLite storage (imported but not used below)
```
Requesting the page

```python
def askUrl(url):
    # Spoof a browser User-Agent so Douban does not reject the request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4544.0 Safari/537.36 Edg/93.0.933.1",
    }
    request = urllib.request.Request(url, headers=headers)
    html = ""   # fallback so the function still returns something on failure
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except Exception as err:
        print("Request failed:", err)
    return html
```
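To sanity-check askUrl, you can fetch the first page and look at the start of the returned source (the offset 0 here is just an example):

```python
html = askUrl("https://movie.douban.com/top250?start=0")
print(html[:200])   # first 200 characters of the page source
```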
Extracting the data

```python
findLink = re.compile(r'<a href="(.*?)">')                   # detail-page link
findImg = re.compile(r'<img.*src="(.*?)"', re.S)             # poster image URL
findTitle = re.compile(r'<span class="title">(.*?)</span>')  # movie titles
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')  # average rating
findCNumber = re.compile(r'<span>(\d*)人评价</span>')          # number of ratings
findDics = re.compile(r'<span class="inq">(.*?)</span>', re.S)  # one-line tagline
findInfo = re.compile(r'<p class="">(.*?)</p>', re.S)         # director/cast/year info
```
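Each pattern captures one field out of a movie's HTML block. For instance, findTitle pulls every title span from a snippet (the sample HTML below is made up for illustration and only approximates Douban's real markup):

```python
sample = '<span class="title">肖申克的救赎</span><span class="title">&nbsp;/&nbsp;The Shawshank Redemption</span>'
print(re.findall(findTitle, sample))
# ['肖申克的救赎', '&nbsp;/&nbsp;The Shawshank Redemption']
```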
```python
def getData(baseurl):
    datalist = []
    for i in range(0, 10):             # 10 pages, 25 movies per page
        url = baseurl + str(i * 25)
        html = askUrl(url)
        soup = BeautifulSoup(html, "lxml")
        for item in soup.find_all('div', class_="item"):  # one div per movie
            data = []
            item = str(item)
            link = re.findall(findLink, item)[0]
            img = re.findall(findImg, item)[0]
            title = re.findall(findTitle, item)
            rating = re.findall(findRating, item)[0]
            cnumber = re.findall(findCNumber, item)[0]
            dic = re.findall(findDics, item)
            info = re.findall(findInfo, item)[0]
            info = re.sub(r'<br(\s+)?/>(\s+)?', "", info)  # strip <br/> tags
            info = re.sub('/', "", info)                   # strip separators
            data.append(link)
            data.append(img)
            if len(title) == 2:
                data.append(title[0])   # Chinese title
                data.append(title[1])   # foreign title
            else:
                data.append(title[0])
                data.append('')         # no foreign title
            data.append(rating)
            data.append(cnumber)
            if len(dic) == 0:
                data.append('')         # some entries have no tagline
            else:
                data.append(dic[0].replace("。", ""))
            data.append(info.strip())
            datalist.append(data)
    for items in datalist:
        print(items)
    return datalist
```
Saving the data

```python
def saveData(datalist, path):
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)
    # column headers: link, image link, Chinese title, foreign title,
    # rating, rating count, tagline, info
    col = ('链接', '图片链接', '中文名', '外文名', '评分', '评价人数', '描述', '信息')
    for i in range(0, 8):
        sheet.write(0, i, col[i])       # header row
    for i in range(0, 250):
        print("Writing row %d" % i)
        data = datalist[i]
        for j in range(0, 8):
            sheet.write(i + 1, j, data[j])
    book.save('豆瓣Top250.xls')
```
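sqlite3 is imported at the top but never used in this version. As a minimal sketch, the same eight-field rows could be written to SQLite instead of Excel; the function name saveDataDB, table name movie250, and column names here are assumptions for illustration, not part of the original code:

```python
def saveDataDB(datalist, dbpath):
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    # hypothetical schema mirroring the eight fields collected by getData
    cur.execute('''
        CREATE TABLE IF NOT EXISTS movie250 (
            link TEXT, img TEXT, cname TEXT, ename TEXT,
            rating TEXT, count TEXT, intro TEXT, info TEXT
        )
    ''')
    cur.executemany(
        "INSERT INTO movie250 VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
        datalist
    )
    conn.commit()
    conn.close()
```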
Main routine

```python
def main():
    baseurl = "https://movie.douban.com/top250?start="
    datalist = getData(baseurl)   # scrape the data
    path = ".\\"                  # save path (unused for now; the file name 豆瓣Top250.xls is hard-coded in saveData)
    saveData(datalist, path)      # save the data
```
At this point, the initial scrape is complete and the results are stored in 豆瓣Top250.xls.
```python
if __name__ == "__main__":
    main()
    print("Done scraping")
```