python写的爬虫返回网页总是为乱码,求高手解决
#-*-coding:utf-8-*-importurllibimporturllib2importrandomimportreimportbs4importcharde...
# -*- coding: utf-8 -*-
import urllib
import urllib2
import random
import re
import bs4
import chardet
def getRequest(url, header):
    """Fetch *url* with the given request headers and return the body bytes.

    urllib2 does not transparently undo HTTP content encodings. If the
    request advertises ``Accept-Encoding: gzip,deflate`` the server may
    answer with a compressed body, which looks like garbage when printed
    or fed to an HTML parser. This function therefore inspects the
    response's ``Content-Encoding`` header and decompresses accordingly.

    Args:
        url: Absolute URL to request.
        header: Dict of HTTP request headers passed to ``urllib2.Request``.

    Returns:
        The decoded (decompressed) response body as a byte string. No
        character-set decoding is performed.

    Raises:
        urllib2.URLError: Propagated from ``urllib2.urlopen`` on failure.
    """
    import gzip
    import io
    import zlib

    request = urllib2.Request(url, headers=header)
    response = urllib2.urlopen(request)
    try:
        body = response.read()
        content_encoding = response.info().get('Content-Encoding', '')
    finally:
        # Always release the underlying socket, even if read() fails.
        response.close()

    content_encoding = content_encoding.strip().lower()
    if content_encoding in ('gzip', 'x-gzip'):
        return gzip.GzipFile(fileobj=io.BytesIO(body)).read()
    if content_encoding == 'deflate':
        try:
            # Spec-compliant servers send a zlib-wrapped stream.
            return zlib.decompress(body)
        except zlib.error:
            # Some servers send a raw deflate stream with no zlib header.
            return zlib.decompress(body, -zlib.MAX_WBITS)
    return body
# Request headers sent with the page fetch below.
HEADER = {
    'Host': 'px.h2o-china.com',
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.8.0.12)',
    'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Language': 'en-us,en;q=0.5',
    # 'Accept-Encoding: gzip,deflate' is deliberately NOT sent: urllib2 does
    # not transparently decompress response bodies, so advertising gzip makes
    # the server return compressed bytes that print as garbage.
    'Accept-Charset': 'gbk;q=0.7,*;q=0.7',
    'Referer': 'http://px.h2o-china.com/2014/view?key=YzBkZFgxdHRncEhMN0lKNlVDUmlkamloNU8xaGlJb204bnJXbXJkWWJhOVFzdGtQRnFWVnpsZDBKb3RoeWFMVzhoUmgxUlF1MElJZExualBaWEZPYUpz&from=singlemessage&isappinstalled=0',
    'Cookie': 'PHPSESSID=mofsmmnds17rqueqcjih30k971',
}

# Fetch the page, guess its character encoding, and parse it.
html = getRequest('http://px.h2o-china.com/2014/view?key=MjgxMW1pYmhPR1oxZVFKSytEUjBnSHVkY3FEUnJiU0tWWXBWMWpUYWlxYk4zRFJycGZxVDhQKzd1a1ZBVVNaZHdMQ3NRbXR2T2Zxd1VNTVVYamdXcTcw', HEADER)
encodeofhtml = chardet.detect(html)
content = bs4.BeautifulSoup(html)
print(encodeofhtml)
print(content)
网上搜到的各种方法都试了,还是不行 展开
import urllib
import urllib2
import random
import re
import bs4
import chardet
def getRequest(url, header):
    """Fetch *url* with the given request headers and return the body bytes.

    urllib2 does not transparently undo HTTP content encodings. If the
    request advertises ``Accept-Encoding: gzip,deflate`` the server may
    answer with a compressed body, which looks like garbage when printed
    or fed to an HTML parser. This function therefore inspects the
    response's ``Content-Encoding`` header and decompresses accordingly.

    Args:
        url: Absolute URL to request.
        header: Dict of HTTP request headers passed to ``urllib2.Request``.

    Returns:
        The decoded (decompressed) response body as a byte string. No
        character-set decoding is performed.

    Raises:
        urllib2.URLError: Propagated from ``urllib2.urlopen`` on failure.
    """
    import gzip
    import io
    import zlib

    request = urllib2.Request(url, headers=header)
    response = urllib2.urlopen(request)
    try:
        body = response.read()
        content_encoding = response.info().get('Content-Encoding', '')
    finally:
        # Always release the underlying socket, even if read() fails.
        response.close()

    content_encoding = content_encoding.strip().lower()
    if content_encoding in ('gzip', 'x-gzip'):
        return gzip.GzipFile(fileobj=io.BytesIO(body)).read()
    if content_encoding == 'deflate':
        try:
            # Spec-compliant servers send a zlib-wrapped stream.
            return zlib.decompress(body)
        except zlib.error:
            # Some servers send a raw deflate stream with no zlib header.
            return zlib.decompress(body, -zlib.MAX_WBITS)
    return body
# Request headers sent with the page fetch below.
HEADER = {
    'Host': 'px.h2o-china.com',
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.8.0.12)',
    'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Language': 'en-us,en;q=0.5',
    # 'Accept-Encoding: gzip,deflate' is deliberately NOT sent: urllib2 does
    # not transparently decompress response bodies, so advertising gzip makes
    # the server return compressed bytes that print as garbage.
    'Accept-Charset': 'gbk;q=0.7,*;q=0.7',
    'Referer': 'http://px.h2o-china.com/2014/view?key=YzBkZFgxdHRncEhMN0lKNlVDUmlkamloNU8xaGlJb204bnJXbXJkWWJhOVFzdGtQRnFWVnpsZDBKb3RoeWFMVzhoUmgxUlF1MElJZExualBaWEZPYUpz&from=singlemessage&isappinstalled=0',
    'Cookie': 'PHPSESSID=mofsmmnds17rqueqcjih30k971',
}

# Fetch the page, guess its character encoding, and parse it.
html = getRequest('http://px.h2o-china.com/2014/view?key=MjgxMW1pYmhPR1oxZVFKSytEUjBnSHVkY3FEUnJiU0tWWXBWMWpUYWlxYk4zRFJycGZxVDhQKzd1a1ZBVVNaZHdMQ3NRbXR2T2Zxd1VNTVVYamdXcTcw', HEADER)
encodeofhtml = chardet.detect(html)
content = bs4.BeautifulSoup(html)
print(encodeofhtml)
print(content)
网上搜到的各种方法都试了,还是不行 展开
1个回答
推荐律师服务:
若未解决您的问题,请您详细描述您的问题,通过百度律临进行免费专业咨询
广告 您可能关注的内容 |