# Python crawler: user-agent pool (Python爬虫用户代理池)
# 用户代理池
# 如果经常用一个浏览器伪装爬,很容易被发现,这时候用多个代理(多个浏览器)访问
# 用户代理池:用多个浏览器标识构成一个集合,相当于一个池子,随机使用一个
# 让网站以为很多人访问,网站更难识别
import urllib.request #爬网页先导入这个
import random # 需要导入随机模块
# Pool of real-browser User-Agent strings; UA() below picks one at random so
# successive requests appear to come from different browsers.
# Fixed: first entry had the typo "kHTML" — the standard token is "KHTML",
# as in the second entry.
uapools = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0",
]
def UA():
    """Install a global urllib opener whose User-Agent is drawn at random from uapools."""
    selected = random.choice(uapools)
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-Agent", selected)]
    # install_opener makes every subsequent urlopen() use this header.
    urllib.request.install_opener(opener)
    print("当前使用UA:" + str(selected))
# Demo 1: fetch the page 10 times, switching to a random User-Agent before
# every single request, and print the size of each response.
url = "https://www.qiushibaike.com/"
for _ in range(10):
    UA()
    page = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    print(len(page))
# Exercise: rotate the User-Agent only once every 3 requests.
for i in range(10):
    if i % 3 == 0:
        UA()
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
# ====================== Part 2 ======================
# 批量爬取糗事百科段子数据
# 目标站点: http://www.qiushibaike.com/
# 目标数据: 热门段子
# 要求: 实现自动翻页
# 做爬虫前要先分析
#首先打开网页源数据,定位要爬的数据,分析上下文,找到标志性标签
# 找到<div class="content">...</div>
#翻页功能:观察翻页时网址的变化:https://www.qiushibaike.com/text/page/1/
import urllib.request #爬网页先导入这个
import random # 需要导入随机模块
import re
# Pool of real-browser User-Agent strings; UA() below picks one at random so
# successive requests appear to come from different browsers.
# Fixed: first entry had the typo "kHTML" — the standard token is "KHTML",
# as in the second entry.
uapools = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0",
]
def UA():
    """Install a global urllib opener carrying a randomly chosen User-Agent header."""
    chosen = random.choice(uapools)
    handler = urllib.request.build_opener()
    handler.addheaders = [("User-Agent", chosen)]
    # All later urlopen() calls go through this opener (and its header).
    urllib.request.install_opener(handler)
# Demo 2: crawl pages 1-35 of the hot-jokes section, rotating the
# User-Agent once every 3 pages, and print each extracted joke.
for i in range(0, 35):
    if i % 3 == 0:
        UA()  # refresh the installed opener with a new random User-Agent
    thisurl = "https://www.qiushibaike.com/text/page/" + str(i + 1) + "/"
    try:
        data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")
        # Each joke sits in a <div class="content"> ... </div>;
        # re.S lets '.' match newlines so one pattern spans the whole div.
        pat = '<div class="content">(.*?)</span>(.*?)</span>.*?</div>'
        rst = re.compile(pat, re.S).findall(data)
        for j in range(0, len(rst)):
            print(rst[j])
            print("------")
    except Exception as err:
        # Fixed: was a silent `pass` — report the failure for this page and
        # keep crawling, consistent with the file-writing loop below.
        print(err)
# Demo 3: same crawl as above, but write each joke to a local text file.
# NOTE(review): hard-coded Windows path — assumes drive E: exists; confirm.
# Fixed: the original ended with the invalid statement `ff1.closeend;`
# (a SyntaxError, so the file was never closed); a `with` block now
# guarantees the file is closed even if an exception escapes the loop.
with open("e:/qiushi.txt", "w") as ff1:
    pK = 0  # running count of jokes written so far
    for i in range(0, 35):
        if i % 3 == 0:
            UA()  # rotate the User-Agent once every 3 pages
        thisurl = "https://www.qiushibaike.com/text/page/" + str(i + 1) + "/"
        try:
            data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")
            # Each joke sits in a <div class="content"> ... </div>; re.S
            # lets '.' match newlines.
            pat = '<div class="content">(.*?)</span>(.*?)</span>.*?</div>'
            rst = re.compile(pat, re.S).findall(data)
            for j in range(0, len(rst)):
                pK = pK + 1
                print(str(i), str(j))  # progress: page index, joke index
                ff1.write(str(pK) + "\r\n")  # \r\n so Windows Notepad shows line breaks
                ff1.write("-------\r\n")
                ff1.write(str(rst[j]) + "\r\n")
        except Exception as err:
            # Report the failed page and keep going.
            print(err)
# Tags: Python, user-agent pool (用户代理池), browser spoofing (浏览器伪装)