1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107
| """ @author: Gary Wang """ import requests import xlwt import time from bs4 import BeautifulSoup
def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block forever on a stalled
            connection.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, or a network-level failure such as a
        connection error or timeout).
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/67.0.3396.99 Safari/537.36'
        ),
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Treat transport failures the same way as a bad status code so the
        # caller only ever has to check for None.
        print('读网页失败,无数据!')
        return None

    if response.status_code == 200:
        print('读取网页成功!')
        return response.text

    print('读网页失败,无数据!')
    return None
def parse_html(html):
    """Yield one dict per data row found in the table markup of *html*.

    Every ``<tr>`` except the first three and the last one is examined
    (presumably header and footer rows -- confirm against the page layout).
    The text of the first eleven ``<td>`` cells of a row is mapped, in
    order, onto the keys listed in ``fields``. Rows with fewer than eleven
    cells are skipped.

    Args:
        html: Page markup to parse.

    Yields:
        dict mapping each field name to the text of the matching cell.
    """
    fields = (
        'issue',
        'WinningNumbers',
        'sum',
        'Totalsales',
        'Direct',
        'Direct_bonus',
        'three_selection',
        'three_selection_bonus',
        'six__selection',
        'six__selection_bonus',
        'time',
    )
    soup = BeautifulSoup(html, 'lxml')
    for row in soup.select('tr')[3:-1]:
        # Query the cells once per row instead of once per field.
        cells = row.select('td')
        if len(cells) < len(fields):
            continue
        # zip() stops after the last field, so extra trailing cells are ignored.
        yield {name: cell.text for name, cell in zip(fields, cells)}
def write_to_excel():
    """Fetch the draw history page and export its rows to ``pl3.xls``.

    A workbook with a single sheet named ``pl3`` is created, a header row
    is written, and then one spreadsheet row is written per item produced
    by ``parse_html``. If the download fails (``get_html`` returns
    ``None``) only the header row is written. The workbook is saved in the
    current working directory; a status message is printed either way.
    """
    # (item key, column header) pairs, in spreadsheet column order.
    columns = (
        ('issue', '期号'),
        ('WinningNumbers', '中奖号码'),
        ('sum', '总和'),
        ('Totalsales', '总销售额(元)'),
        ('Direct', '直选注数'),
        ('Direct_bonus', '直选奖金'),
        ('three_selection', '组选3注数'),
        ('three_selection_bonus', '组选3奖金'),
        ('six__selection', '组选6注数'),
        ('six__selection_bonus', '组选6奖金'),
        ('time', '开奖日期'),
    )

    f = xlwt.Workbook()
    sheet1 = f.add_sheet('pl3', cell_overwrite_ok=True)
    for col, (_, title) in enumerate(columns):
        sheet1.write(0, col, title)

    url = 'http://datachart.500.com/pls/history/inc/history.php?limit=15116&start=04001&end=19117'
    html = get_html(url)
    print('正在提取保存数据......')
    if html is not None:
        # Each item is a dict yielded by parse_html; copy its values into
        # the sheet, one spreadsheet row per item, starting below the header.
        for row, item in enumerate(parse_html(html), start=1):
            # A blank group-3 cell zeroes both group-3 fields; otherwise both
            # group-6 fields are zeroed. Presumably only one of the two prize
            # types applies to any given draw -- confirm against the page.
            # NOTE(review): the blank marker is a single plain space as
            # written here; verify it matches the cell text actually served.
            if item['three_selection'] == ' ':
                item['three_selection'] = '0'
                item['three_selection_bonus'] = '0'
            else:
                item['six__selection'] = '0'
                item['six__selection_bonus'] = '0'

            for col, (key, _) in enumerate(columns):
                sheet1.write(row, col, item[key])

    try:
        f.save('pl3.xls')
    except Exception:
        # Best-effort export: report the failure instead of crashing, but
        # let KeyboardInterrupt / SystemExit propagate.
        print('写入EXCEL表失败')
    else:
        print('写入EXCEL表pl3.xls成功!')
def main():
    """Script entry point: run the scrape-and-export routine."""
    write_to_excel()
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
|