1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
   | import re   import urllib.error   import urllib.request import xlwt   from bs4 import BeautifulSoup  
# Pre-compiled patterns for scraping each show entry out of the raw div HTML.
# findTitle: grabs the show title wrapped in <strong>...</strong> (tags stripped later).
findTitle = re.compile(r'<strong>.*?</strong>')
# findCategory: grabs the category text between <p> and the first <br/> (tags stripped later).
findCategory = re.compile(r'<p>.*?<br/>')
 
 
  def getData(baseurl):     datalist = []     html = askURL(baseurl)          bs = BeautifulSoup(html, "html.parser")
           for item in bs.find_all('div', class_="a0"):         data = []           item = str(item)
                   if item == '<div class="a0">收藏数最多的剧</div>':             continue
                   title = re.findall(findTitle, item)[0]         title = re.sub("</*strong>", "", title)         category = re.findall(findCategory, item)[0].replace("<p>", "")         category = category.replace('<br/>', '')
                   data.append(title)         data.append(category)
                   datalist.append(data)
      return datalist
 
 
  def askURL(url):     head = {         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36 Edg/80.0.361.69"     }     request = urllib.request.Request(url, headers=head)     html = ""     try:         response = urllib.request.urlopen(request)         html = response.read().decode()     except urllib.error.URLError as e:         if hasattr(e, "code"):             print(e.code)         if hasattr(e, "reason"):             print(e.reason)
      return html
 
 
  def saveData(savepath, datalist):     workbook = xlwt.Workbook(encoding="utf-8")     worksheet = workbook.add_sheet('sheet1')     row = 0     for item in datalist:         worksheet.write(row, 0, item[0])         worksheet.write(row, 1, item[1])         row = row + 1     workbook.save(savepath)
 
  if __name__ == "__main__":     baseurl = "http://www.rrys2019.com/html/top/total_fav_list.html"     savepath = r"C:\Users\38487\Desktop\Top50.xls"          datalist = getData(baseurl)          saveData(savepath, datalist)
   |