Yamibuy Scraper

  1. While writing this I noticed a small detail: without headers, a list page returns 20 items per page; with normal browser headers, the count matches what the browser shows, 48. The per-page count differs, but the category's total item count stays the same, so there are simply more pages; to be a bit more efficient I kept the headers on. I suspect the bare request is being served a WAP page, but I couldn't find the site's WAP version, so I let it go (a small sketch of the comparison follows this list).
  2. Also, I finished it past 1 a.m. and it looked fine running, but in the morning I found it had died with the remote server actively refusing connections. So I dropped the 0.5-second delay and wrapped the request in a retry loop instead, which is a bit more reliable.
  3. I remembered that sqlite doesn't need a full path, but test inserts failed with a table-not-found error; switching to an absolute path made it go away. I'm not sure why yet; presumably a relative path is resolved against the current working directory, so the script can end up opening a different, empty database (see the second sketch below).
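
As a quick check of point 1, a minimal sketch of the comparison (the URL is a placeholder; substitute a real category list URL from the site):

import requests
from lxml import etree

UA = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
	'(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
url = 'http://www.yamibuy.com/cn/'	# placeholder: use a real category list URL here
item_xpath = '//*[@id="itemContainer"]/div[1]/div/div/a[3]'

for label, headers in (('no headers', None), ('with headers', {'User-Agent': UA})):
	html = etree.HTML(requests.get(url, headers=headers).text)
	# per note 1: roughly 20 items come back without headers, 48 with them
	print(label, len(html.xpath(item_xpath)))

And for point 3, one way to sidestep the path issue entirely is to resolve the DB file relative to the script itself (just a sketch; the code below keeps the literal absolute path):

import sqlite3
from pathlib import Path

# A relative path is resolved against the current working directory, so running
# the script from another directory silently opens a different, empty yami.db.
# Anchoring the path to the script's own location avoids that.
db_path = Path(__file__).resolve().parent / 'yami.db'
conn = sqlite3.connect(str(db_path))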

Below: a screenshot of a run, followed by the full script.

import requests
import time
import sqlite3
import re
from lxml import etree

class yami:
	def __init__(self):
		self.site = 'http://www.yamibuy.com/cn/'
		self.headers = {
			'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
		}
		self.CONN = sqlite3.connect(r'F:\python\yamibuy\yami.db')	# raw string keeps the Windows backslashes literal
		self.CUR = self.CONN.cursor()
		
		self.main_loop()

	# main loop
	def main_loop(self):
		# walk every third-level category
		for cat in self.get_list():
			print(cat)
			# walk the pages; the bound is generous, we break on the first empty page
			for page in range(1,555):
				url = 'http://www.yamibuy.com/cn/{0}&brands=&brand_name_list=&sort_order=0&sort_by=3&page={1}'.format(cat[3],page)
				data = self.list_page(cat,url,page)
				if len(data) == 0:
					break
				self.goods_sql(data)
			print(cat[:3], 'done')
	
	# insert one page of rows
	def goods_sql(self,data):
		# parameterized query: no string formatting, no manual quote-escaping needed
		sql = ('INSERT INTO yami_0228 (name, price, url, cat1, cat2, cat3, '
			'brand, brand_c, weight, comment) VALUES (?,?,?,?,?,?,?,?,?,?)')
		self.CUR.executemany(sql, data)
		self.CONN.commit()	# one commit per page instead of per row

	# fetch one list page; returns a row tuple per product, or [] past the last page
	def list_page(self, cat, url, page):
		time.sleep(1)
		html_str = requests.get(url,headers=self.headers).text
		html = etree.HTML(html_str)
		goods_list = html.xpath('//*[@id="itemContainer"]/div[1]/div/div/a[3]')
		if len(goods_list) == 0:
			return []
		print('page', page, len(goods_list))
		result = []
		for goods in goods_list:
			goods_url = self.site + goods.get('href')
			goods_page = self.goods_page(goods_url)
			result.append((
				goods.xpath('div/p[1]/text()')[0],	# product name
				goods.xpath('div/p[2]/text()')[0],	# price text
				goods_url
			) + cat[:3] + goods_page)
		return result
	
	# fetch a product detail page, retrying until the request succeeds
	def goods_page(self,goods_url):
		while True:
			try:
				html_str = requests.get(goods_url,headers=self.headers).text
				break
			except requests.RequestException:
				time.sleep(1)	# brief pause so a refusing server isn't hammered in a tight loop
			
		html = etree.HTML(html_str)
		
		comment = html.xpath('/html/body/div[1]/div[4]/div/div[2]/div[2]/a[1]/text()')
		comment = self.check(comment,'0')
		if comment != '0':
			comment = re.findall(r'\d+',comment)[0]	# extract the review count digits
		
		brand = html.xpath('//div[@class="selling-points"]/p[1]/a/text()')
		brand = self.check(brand,'')
		
		brand_c = html.xpath('//div[@class="selling-points"]/p[2]/text()')
		brand_c = self.check(brand_c,'')

		weight = html.xpath('//div[@class="selling-points"]/p[3]/text()')
		weight = self.check(weight,'')
		
		return (brand,brand_c,weight,comment)
	
	# return the first XPath match, or the given default when nothing matched
	def check(self,value,preset):
		if len(value) == 0:
			return preset
		else:
			return value[0]
	
	# walk the site's three-level category tree, yielding (cat1, cat2, cat3, href)
	def get_list(self):
		html_str = requests.get(self.site,headers=self.headers).text
		html = etree.HTML(html_str)

		cat_1_list = html.xpath('/html/body/div[1]/div[1]/div[3]/div/div[2]/div')
		for cat_1 in cat_1_list:
			cat_1_name = cat_1.xpath('div[1]/h2/text()')[0]

			cat_2_list = cat_1.xpath('div[2]/div/div[1]/div')
			for cat_2 in cat_2_list:
				cat_2_name = cat_2.xpath('p/a/text()')[0]

				cat_3_list = cat_2.xpath('ul/li/a')
				for cat_3 in cat_3_list:
					cat_3_name = cat_3.text

					yield (cat_1_name,cat_2_name,cat_3_name,cat_3.get('href'))

if __name__ == '__main__':
	yami()
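
The table schema, which the script expects to exist already:
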
CREATE TABLE "yami_0228" (
"id"  INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
"name"  TEXT,
"price"  TEXT,
"url"  TEXT,
"cat1"  TEXT,
"cat2"  TEXT,
"cat3"  TEXT,
"brand"  TEXT,
"brand_c"  TEXT,
"weight"  TEXT,
"comment"  TEXT
);
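
Once it has collected some data, a quick sanity check (a sketch; same DB path and table as above):

import sqlite3

conn = sqlite3.connect(r'F:\python\yamibuy\yami.db')
# row counts per top-level category, largest first
for cat1, n in conn.execute(
		'SELECT cat1, COUNT(*) AS n FROM yami_0228 GROUP BY cat1 ORDER BY n DESC'):
	print(cat1, n)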


Haven't done this in a while, so I'm a bit rusty.
