0. When building web projects you often need a script to pull raw data off the web. This post walks through a Python script that downloads images in bulk. The idea is simple: read the page, pick out the image URLs with regular expressions, and download them in a batch. The main modules used are urllib, re and BeautifulSoup. Two things to keep in mind: Python is strict about indentation, and you should not crawl too fast, since most sites rate-limit by IP. Also, many sites load their data through JavaScript, which makes it hard to extract anything by analysing the page source alone, and reading the JS is not much easier. In that case a browser helps: Chrome's developer tools, for example, show every request and its response under the Network tab, which makes it easy to work out where the data you want actually comes from.
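
Before going through the full script, the whole idea fits in a few lines: fetch a listing page, pull the image URLs out with a regular expression, and save each one to disk. The URL and the pattern below are only placeholders to show the shape of the approach, not taken from a real site.

#!/usr/bin/python
#encoding: utf-8
# minimal sketch: fetch a page, regex out the image URLs, download them one by one
import re, urllib, urllib2

html = urllib2.urlopen('http://www.example.com/?page=1').read()
for i, link in enumerate(re.findall(r'http://img\.example\.com/[^"]+\.jpg', html)):
  open(str(i) + '.jpg', 'wb').write(urllib.urlopen(link).read())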

1. Import packages and define global variables

#!/usr/bin/python
#encoding: utf-8

import time,math,os,re,urllib,urllib2,cookielib
from bs4 import BeautifulSoup

class Image(object):
  image_links = [] # image URL links
  image_dir = '/data/image/' # directory where images are saved
  image_count = 0 # number of images downloaded so far
  current_pages = [] # page URLs

2. Browser information (cookie jar and request headers)

def __init__(self):
  self.cj = cookielib.LWPCookieJar()  # cookie jar so cookies persist across requests
  self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
  urllib2.install_opener(self.opener)  # route plain urllib2.urlopen through this opener too
  self.opener.addheaders = [  # pretend to be a regular desktop browser
    ("User-agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.22 (KHTML, like Gecko) Ubuntu Chromium/25.0.1364.160"),
    ("Accept", "*/*")]

3. Get the page URLs

def get_page_link(self,url):
  self.current_pages = []
  try:
    html = self.opener.open(url).read()
    ids = []
    names = []
    # pull the numeric post id out of fragments like ".id=12345" in the inline JS
    for pid in re.findall(r'\.id=\d*', str(html)):
      ppid = re.findall(r'\d+', pid)[0]
      ids.append(ppid)
    # pull the domain name out of fragments like '.domainName="someuser"'
    for pname in re.findall(r'\.domainName="[A-Za-z0-9_-]*"', str(html)):
      ppname = re.findall(r'="[A-Za-z0-9_-]+', pname)[0]
      pppname = re.findall(r'[A-Za-z0-9_-]+', ppname)[0]
      names.append(pppname)
    # stitch each (name, id) pair into a detail-page URL
    for i in range(0, len(ids)):
      page = 'http://pp.xxx.com/' + names[i] + '/pp/' + str(ids[i]) + '.html'
      self.current_pages.append(page)
  except Exception, e:
    self.write_log('get page link error:%s' % e)
    return
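
The two regular expressions pull the post id and the domain name out of the inline JavaScript on the listing page, and the last loop stitches them into detail-page URLs. A quick illustration with a made-up JS fragment (what the real page contains is an assumption here):

import re

js = 'p.id=12345;p.domainName="someuser";'  # hypothetical fragment from the listing page
ids = [re.findall(r'\d+', m)[0] for m in re.findall(r'\.id=\d*', js)]
names = [re.findall(r'[A-Za-z0-9_-]+', re.findall(r'="[A-Za-z0-9_-]+', m)[0])[0]
         for m in re.findall(r'\.domainName="[A-Za-z0-9_-]*"', js)]
print ids, names  # ['12345'] ['someuser']
print 'http://pp.xxx.com/' + names[0] + '/pp/' + ids[0] + '.html'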

4. Get the image URLs from each page

def get_images_link(self):
  self.image_links = []
  for page_link in self.current_pages:
    try:
      html = self.opener.open(page_link).read()
    except Exception, e:
      self.write_log('get image link error:%s' % e)
      return
    soup = BeautifulSoup(html, 'html.parser')
    # lazy-loaded images keep the real URL in data-lazyload-src, not in src
    for link in soup.findAll('img', {'data-lazyload-src': re.compile('http://img2.xxx.net/')}):
      if 'data-lazyload-src="http://img2.xxx.net/' in str(link):
        self.image_links.append(link['data-lazyload-src'])
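
Lazy-loaded images keep the real URL in the data-lazyload-src attribute while src points at a tiny placeholder, which is why the selector above matches on that attribute instead of src. A small illustration on a made-up snippet (the HTML is an assumption, not copied from the real site):

from bs4 import BeautifulSoup
import re

html = '<img src="blank.gif" data-lazyload-src="http://img2.xxx.net/a/b/photo1.jpg">'
soup = BeautifulSoup(html, 'html.parser')
for img in soup.findAll('img', {'data-lazyload-src': re.compile('http://img2.xxx.net/')}):
  print img['data-lazyload-src']  # http://img2.xxx.net/a/b/photo1.jpg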

5. Download the images

def download(self):
  if not self.image_links:
    return False
  for link in self.image_links:
    try:
      data = urllib.urlopen(link).read()
    except Exception, e:
      self.write_log('connect error:%s' % e)
      continue
    self.write_log('downloading ... - %s' % link)
    # name the file after the current Unix timestamp
    file_name = str(int(time.time())) + '.jpg'
    file_path = os.path.join(self.image_dir, file_name)
    image = open(file_path, 'wb')
    try:
      image.write(data)
    except Exception, e:
      self.write_log('failed:%s' % e)
    else:
      self.write_log('success:%s' % link)
      self.image_count += 1
    image.close()
    # throttle a little so the site does not block us
    time.sleep(2)
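
The file name is just the current Unix timestamp, which only stays unique because of the two-second sleep between downloads. If you prefer to keep the original name, one alternative (a variation, not what the method above does) is to take the last path segment of the image URL:

import os, urlparse

link = 'http://img2.xxx.net/a/b/photo1.jpg'                  # example image link
file_name = os.path.basename(urlparse.urlparse(link).path)   # -> 'photo1.jpg'
print file_name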

6. Logging

def write_log(self,text):
  print text
  log = open('log.txt','a')
  log.write(text + '\n')
  log.close()
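
Opening and closing log.txt for every line works, but the standard logging module does the same job and adds timestamps for free. A rough equivalent (a variation on the original, not part of it):

import logging

logging.basicConfig(filename='log.txt', level=logging.INFO,
                    format='%(asctime)s %(message)s')
logging.info('downloading ... - %s' % 'http://img2.xxx.net/a/b/photo1.jpg')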

7. Run

def run(self,url):
	self.get_page_link(url)
	self.get_images_link()
	self.download()

8. The main entry point

if __name__ == '__main__':
  for i in range(1,500): # pages 1 to 499
    url = 'http://www.xxxx.com/?page=' + str(i)
    page_log = '+++++++++++++++++++++ page ' + str(i) + ' +++++++++++++++'
    img = Image()
    img.write_log(page_log)  # write_log already prints, so no separate print is needed
    img.run(url)
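
As mentioned at the top, most sites rate-limit by IP, so besides the two-second pause between images it is worth pausing between listing pages as well. A small variation of the loop above (the ten-second delay is an arbitrary choice, tune it for the target site):

for i in range(1,500):
  url = 'http://www.xxxx.com/?page=' + str(i)
  img = Image()
  img.run(url)
  time.sleep(10)  # pause between listing pages to stay under the site's rate limit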