请教各位:用 Python 编写爬虫时,请求返回 HTTP Error 521,应该怎么解决?
展开全部
原博主用的是 PyV8 来执行 JS 代码,我换成了 PyExecJS:
import execjs
import re
import requests
# Page to fetch. It is an empty string in this snippet; presumably the real
# target URL has to be filled in before running -- TODO confirm.
url = ""

# HTTP headers sent with every request. The spelling of the name is kept
# as-is because the rest of the script refers to it.
HERDERS = {
    "Host": "www.kuaidaili.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
}
def executejs(html):
    """Run the JS challenge embedded in *html* and return what it computes.

    The function pulls the JS source and the numeric argument of the
    ``setTimeout("name(123)", ...)`` call out of the page, patches the
    script so that it returns ``po`` instead of evaluating it, and then
    calls the first JS function through ``execjs``.

    Args:
        html: Text of the page that carries the JS challenge.

    Returns:
        The value returned by the patched JS function (the ``po`` string;
        per the original author's comment this is the cookie content).

    Raises:
        IndexError: If *html* contains no ``setTimeout("...(<digits>)"``
            call or no ``function <name>`` definition. The exception type
            matches what the bare ``[0]`` lookups used to raise, but the
            message now says which part was missing.
    """
    # Extract the JS source: from each "function " up to the closing </script>.
    js_string = ''.join(re.findall(r'(function .*?)</script>', html))

    # Extract the argument the page passes to the JS function.
    arg_matches = re.findall(r'setTimeout\(\"\D+\((\d+)\)\"', html)
    if not arg_matches:
        raise IndexError(
            'no setTimeout("func(<digits>)") call found in html; '
            'the page is probably not the expected JS challenge'
        )
    # NOTE(review): the argument is passed on as a string of digits, exactly
    # as before; confirm whether the JS expects a number instead.
    js_func_arg = arg_matches[0]

    name_matches = re.findall(r'function (\w+)', js_string)
    if not name_matches:
        raise IndexError(
            'no "function <name>" definition found before </script> in html'
        )
    js_func_name = name_matches[0]

    # Patch the script so the function returns the generated code (po)
    # instead of eval-ing it.
    js_string = js_string.replace('eval("qo=eval;qo(po);")', 'return po')
    func = execjs.compile(js_string)
    return func.call(js_func_name, js_func_arg)
def parse_cookie(string):
    """Convert a JS ``document.cookie='name=value; ...'`` statement to a dict.

    Only the first ``name=value`` pair (the text before the first ``;``) is
    kept; attributes such as ``expires`` or ``path`` are discarded.

    Args:
        string: The cookie-setting statement, with or without the leading
            ``document.cookie='`` prefix.

    Returns:
        A one-entry dict ``{name: value}``.

    Raises:
        IndexError: If the first segment contains no ``=``.
    """
    string = string.replace("document.cookie='", "")
    clearance = string.split(';')[0]
    # Split on the first '=' only, so values that themselves contain '='
    # (e.g. base64 padding) are not truncated.
    parts = clearance.split('=', 1)
    return {parts[0]: parts[1]}
# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely.
REQUEST_TIMEOUT = 10

# First request: fetch the page that carries the dynamically generated JS.
first_html = requests.get(url=url, headers=HERDERS, timeout=REQUEST_TIMEOUT).content.decode('utf-8')
# Run the JS to obtain the cookie statement.
cookie_str = executejs(first_html)
# Convert the cookie statement into a dict.
cookie = parse_cookie(cookie_str)
print('cookies = ',cookie)
# Second request: same URL, this time with the computed cookie attached.
response = requests.get(url=url, headers=HERDERS, cookies=cookie, timeout=REQUEST_TIMEOUT)
print(response.status_code)
import execjs
import re
import requests
# Page to fetch. It is an empty string in this snippet; presumably the real
# target URL has to be filled in before running -- TODO confirm.
url = ""

# HTTP headers sent with every request. The spelling of the name is kept
# as-is because the rest of the script refers to it.
HERDERS = {
    "Host": "www.kuaidaili.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
}
def executejs(html):
    """Run the JS challenge embedded in *html* and return what it computes.

    The function pulls the JS source and the numeric argument of the
    ``setTimeout("name(123)", ...)`` call out of the page, patches the
    script so that it returns ``po`` instead of evaluating it, and then
    calls the first JS function through ``execjs``.

    Args:
        html: Text of the page that carries the JS challenge.

    Returns:
        The value returned by the patched JS function (the ``po`` string;
        per the original author's comment this is the cookie content).

    Raises:
        IndexError: If *html* contains no ``setTimeout("...(<digits>)"``
            call or no ``function <name>`` definition. The exception type
            matches what the bare ``[0]`` lookups used to raise, but the
            message now says which part was missing.
    """
    # Extract the JS source: from each "function " up to the closing </script>.
    js_string = ''.join(re.findall(r'(function .*?)</script>', html))

    # Extract the argument the page passes to the JS function.
    arg_matches = re.findall(r'setTimeout\(\"\D+\((\d+)\)\"', html)
    if not arg_matches:
        raise IndexError(
            'no setTimeout("func(<digits>)") call found in html; '
            'the page is probably not the expected JS challenge'
        )
    # NOTE(review): the argument is passed on as a string of digits, exactly
    # as before; confirm whether the JS expects a number instead.
    js_func_arg = arg_matches[0]

    name_matches = re.findall(r'function (\w+)', js_string)
    if not name_matches:
        raise IndexError(
            'no "function <name>" definition found before </script> in html'
        )
    js_func_name = name_matches[0]

    # Patch the script so the function returns the generated code (po)
    # instead of eval-ing it.
    js_string = js_string.replace('eval("qo=eval;qo(po);")', 'return po')
    func = execjs.compile(js_string)
    return func.call(js_func_name, js_func_arg)
def parse_cookie(string):
    """Convert a JS ``document.cookie='name=value; ...'`` statement to a dict.

    Only the first ``name=value`` pair (the text before the first ``;``) is
    kept; attributes such as ``expires`` or ``path`` are discarded.

    Args:
        string: The cookie-setting statement, with or without the leading
            ``document.cookie='`` prefix.

    Returns:
        A one-entry dict ``{name: value}``.

    Raises:
        IndexError: If the first segment contains no ``=``.
    """
    string = string.replace("document.cookie='", "")
    clearance = string.split(';')[0]
    # Split on the first '=' only, so values that themselves contain '='
    # (e.g. base64 padding) are not truncated.
    parts = clearance.split('=', 1)
    return {parts[0]: parts[1]}
# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely.
REQUEST_TIMEOUT = 10

# First request: fetch the page that carries the dynamically generated JS.
first_html = requests.get(url=url, headers=HERDERS, timeout=REQUEST_TIMEOUT).content.decode('utf-8')
# Run the JS to obtain the cookie statement.
cookie_str = executejs(first_html)
# Convert the cookie statement into a dict.
cookie = parse_cookie(cookie_str)
print('cookies = ',cookie)
# Second request: same URL, this time with the computed cookie attached.
response = requests.get(url=url, headers=HERDERS, cookies=cookie, timeout=REQUEST_TIMEOUT)
print(response.status_code)
推荐律师服务:
若未解决您的问题,请您详细描述您的问题,通过百度律临进行免费专业咨询