如何用 Python 实现 Web 抓取
2016-08-07 · 百度知道合伙人官方认证企业
育知同创教育
1【专注:Python+人工智能|Java大数据|HTML5培训】 2【免费提供名师直播课堂、公开课及视频教程】 3【地址:北京市昌平区三旗百汇物美大卖场2层,微信公众号:yuzhitc】
向TA提问
关注
展开全部
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# by carlin.wang
# 请参考
import urllib
import urllib2
import time
import os
import random
from bs4 import BeautifulSoup
def get_Html(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Sends a desktop-Firefox User-Agent header instead of the urllib2
    default. Fix over the original: the response object is now closed
    in a ``finally`` block, so the underlying socket is released even
    when ``read()`` or ``decode()`` raises.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0"}
    req = urllib2.Request(url, headers=headers)
    res = urllib2.urlopen(req)
    try:
        return res.read().decode('UTF-8')
    finally:
        res.close()  # original leaked the connection
def urlPages(page):
    """Build the jandan.net 'ooxx' gallery URL for the given page number."""
    return ''.join(["http://jandan.net/ooxx/page-", str(page), '#comments'])
def find_img_url(html):
    """Parse *html* and return every tag carrying the 'view_img_link' class."""
    parsed = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
    return parsed.find_all(class_='view_img_link')
def download_img(url):
    """Download one image *url* into D:/data/jiandan under a random name.

    Best-effort: returns the string "down image error!" on any failure,
    ``None`` otherwise (same contract as the original). Downloads smaller
    than 100 bytes are assumed to be error stubs and are deleted.
    """
    fdir = "D:/data/jiandan"
    if not fdir or not os.path.exists(fdir):
        os.makedirs(fdir)
    if not url:
        return
    try:
        # Random hex string as the file name, so repeated image names
        # from the site cannot collide on disk.
        fname = ''.join(map(lambda xx: (hex(ord(xx))[2:]), os.urandom(16)))
        # Take the extension from the LAST path segment. The original
        # used url.split('/')[4].split('.')[1], which hard-codes the URL
        # depth and picks the wrong piece for multi-dot file names.
        ext = os.path.splitext(url.split('/')[-1])[1]
        filename, msg = urllib.urlretrieve(url, fdir + "/" + fname + ext)
        # Anything under 100 bytes is an error page, not a real image.
        if os.path.getsize(filename) < 100:
            os.remove(filename)
    except Exception:  # deliberate best-effort: one bad URL must not stop the crawl
        return "down image error!"
def run():
for page in range(2001,2007):
html = get_Html(urlPages(page))
urls = find_img_url(html)
for url in urls:
s = url.get('href')
print s
download_img(s)
if __name__ == '__main__':
run()
# -*- coding: utf-8 -*-
# by carlin.wang
# 请参考
import urllib
import urllib2
import time
import os
import random
from bs4 import BeautifulSoup
def get_Html(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Sends a desktop-Firefox User-Agent header instead of the urllib2
    default. Fix over the original: the response object is now closed
    in a ``finally`` block, so the underlying socket is released even
    when ``read()`` or ``decode()`` raises.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0"}
    req = urllib2.Request(url, headers=headers)
    res = urllib2.urlopen(req)
    try:
        return res.read().decode('UTF-8')
    finally:
        res.close()  # original leaked the connection
def urlPages(page):
    """Build the jandan.net 'ooxx' gallery URL for the given page number."""
    return ''.join(["http://jandan.net/ooxx/page-", str(page), '#comments'])
def find_img_url(html):
    """Parse *html* and return every tag carrying the 'view_img_link' class."""
    parsed = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
    return parsed.find_all(class_='view_img_link')
def download_img(url):
    """Download one image *url* into D:/data/jiandan under a random name.

    Best-effort: returns the string "down image error!" on any failure,
    ``None`` otherwise (same contract as the original). Downloads smaller
    than 100 bytes are assumed to be error stubs and are deleted.
    """
    fdir = "D:/data/jiandan"
    if not fdir or not os.path.exists(fdir):
        os.makedirs(fdir)
    if not url:
        return
    try:
        # Random hex string as the file name, so repeated image names
        # from the site cannot collide on disk.
        fname = ''.join(map(lambda xx: (hex(ord(xx))[2:]), os.urandom(16)))
        # Take the extension from the LAST path segment. The original
        # used url.split('/')[4].split('.')[1], which hard-codes the URL
        # depth and picks the wrong piece for multi-dot file names.
        ext = os.path.splitext(url.split('/')[-1])[1]
        filename, msg = urllib.urlretrieve(url, fdir + "/" + fname + ext)
        # Anything under 100 bytes is an error page, not a real image.
        if os.path.getsize(filename) < 100:
            os.remove(filename)
    except Exception:  # deliberate best-effort: one bad URL must not stop the crawl
        return "down image error!"
def run():
for page in range(2001,2007):
html = get_Html(urlPages(page))
urls = find_img_url(html)
for url in urls:
s = url.get('href')
print s
download_img(s)
if __name__ == '__main__':
run()
推荐律师服务:
若未解决您的问题,请您详细描述您的问题,通过百度律临进行免费专业咨询