# Proxy-pool scraping demo.
# (The original file carried a garbled "1 2 3 ... 94" line-number run and "|"
# markers from a copy/paste extraction; they were artifacts, not code.)
import re
from time import sleep

import urllib3
from bs4 import BeautifulSoup
from lxml import etree
def getIpPool(url):
    """Download the raw HTML of ``url`` using a browser-like header set.

    Parameters
    ----------
    url : str
        Fully-qualified URL of the page to fetch (e.g. a proxy-list page).

    Returns
    -------
    bytes
        The raw response body; callers are expected to ``.decode('utf-8')``.
    """
    http = urllib3.PoolManager(
        headers={
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            # BUG FIX: the original also advertised 'sdch, br', but urllib3
            # only transparently decodes gzip/deflate — a brotli-encoded
            # response would leave compressed bytes that break the caller's
            # .decode('utf-8'). Only advertise encodings we can handle.
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
        }
    )
    res = http.request('GET', url)
    return res.data
# Harvest proxy addresses until the pool reaches the desired size.
ip_pool = []
ip_pool_max = 1

while len(ip_pool) < ip_pool_max:
    # NOTE(review): the page index is hard-coded to 1; make it a counter if
    # more than one page of the proxy list is ever needed.
    res = getIpPool('https://www.xicidaili.com/nn/' + str(1)).decode('utf-8')
    text_html = BeautifulSoup(res, 'lxml')
    ip_list = text_html.find_all(id='ip_list')[0]
    print(ip_list)
    # BUG FIX: re.findall() requires a string/bytes pattern target, but
    # find_all() returns a bs4 Tag — the original raised
    # "TypeError: expected string or bytes-like object". Stringify first.
    ip_arr = re.findall('img(.*?)</td>', str(ip_list))
    # Placeholder entry; the parsed addresses in ip_arr are not yet consumed.
    ip_pool.append('aaa')
# Demo: fetch the Baidu front page and inspect its first <a> element.
# BUG FIX: the original issued the same GET twice and built a throwaway
# BeautifulSoup(..., 'html.parser') object that was immediately overwritten
# by the 'lxml' one; the redundant request and parse have been removed.
http = urllib3.PoolManager()
res = http.request('GET', 'http://www.baidu.com/')
html = res.data.decode('utf-8')
soup = BeautifulSoup(html, 'lxml')

print(type(soup.a))    # <class 'bs4.element.Tag'>
print(soup.a)          # the first anchor element
print(soup.a['href'])  # its href attribute
print(soup.a.attrs)    # all of its attributes as a dict