A simple JD.com crawler
JD.com crawler
Reference: the post "Python爬虫:爬取京东商品简介" (Python crawler: scraping JD product summaries).
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @Time : 2021/8/6
import requests
from bs4 import BeautifulSoup
import xlwt
import time
class Config():
    '''
    PARAMETER_LABEL: adjust these keys to match the spec fields shown on the product detail page
        you are scraping; '评论数' (comment count) is fetched separately via another request.
    TITLE_LABEL: when PARAMETER_LABEL changes, update the Excel header row to match.
    GOOD_LABEL: no need to modify; these four fields are always scraped from the search page.
    need_list: the fields that can be read directly from the parsed HTML tags.
    '''
    # Excel header row
TITLE_LABEL = ['商品名称', '价格', '商家', '商品详情地址', '工艺', '图案', '风格', '材质', '形状', '直径', '评论数']
    # keys in the HTML corresponding to TITLE_LABEL
GOOD_LABEL = ['name', 'price', 'shop', 'detail_addr']
    # fields to scrape from the product detail page (the page reached by clicking a single item); '评论数' must stay last
PARAMETER_LABEL = ['工艺', '图案', '风格', '材质', '形状', '直径', '评论数']
    # need_list is PARAMETER_LABEL without '评论数'
need_list = PARAMETER_LABEL[:-1]
    # combine the search-page keys with the detail-page keys
TOTAL_LABEL = GOOD_LABEL + PARAMETER_LABEL
    # save path of the Excel file
SAVE_PATH = './test.xls'
    # Request headers sent with every request.
    # The headers differ per machine and per JD account; see https://blog.csdn.net/weixin_41998772/article/details/106476166 for how to obtain them.
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'accept-language': 'zh-CN,zh;q=0.9',
}
    # search keyword
keyword = str(input('请输入需要爬取的信息关键字:'))
total = input('请输入需要爬取页数: ')
class Excel():
    # number of table columns
    TABLE_COL = len(Config.TITLE_LABEL)
    # current row index
    _current_row = 1
    # initialise: create the workbook and write the header row
def __init__(self, sheet_name='sheet1'):
self.write_work = xlwt.Workbook(encoding='ascii')
self.write_sheet = self.write_work.add_sheet(sheet_name)
for item in range(len(Config.TITLE_LABEL)):
            # write the Excel header into the first row
self.write_sheet.write(0, item, label=Config.TITLE_LABEL[item])
    # write one record
def write_content(self, content):
print(content)
        if content['detail_addr'] != '无':  # skip items whose detail address could not be obtained
            for item in range(self.TABLE_COL):
                # if the last column is the special label '标题材质关键字', extract material keywords from the product title instead
                if (item == self.TABLE_COL - 1) and (Config.TOTAL_LABEL[-1] == '标题材质关键字'):
                    self.write_sheet.write(self._current_row, item, label=self.title_extract(content['name']))
                else:
                    self.write_sheet.write(self._current_row, item, label=content[Config.TOTAL_LABEL[item]])
            # move to the next row after inserting one record
            self._current_row += 1
    # save the file
def save_file(self, file_url=Config.SAVE_PATH):
try:
self.write_work.save(file_url)
print("文件保存成功!文件路径为:" + file_url)
except IOError:
print("文件保存失败!")
    # extract material keywords from the product title
    def title_extract(self, title):
        # list of material keywords to look for
materials = ['陶瓷', '骨瓷', '玻璃', '搪瓷', '木制', '木质', '不锈钢', '塑料']
contain = ''
count = 0
for material in materials:
if material in title:
                # so that the final output looks like "陶瓷、木质、不锈钢"
if count == 0:
contain = contain + material
count += 1
else:
contain = contain + '、' + material
count += 1
return contain
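# A more compact variant of Excel.title_extract (an illustrative sketch, not part of the original
# script): the same "陶瓷、木质、不锈钢" string can be built with str.join instead of the manual
# counter. The name title_extract_joined is made up for this example.
def title_extract_joined(title):
    materials = ['陶瓷', '骨瓷', '玻璃', '搪瓷', '木制', '木质', '不锈钢', '塑料']
    # keep only the keywords that occur in the title and join them with '、'
    return '、'.join(material for material in materials if material in title)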
class Goods:
    # initialiser
def __init__(self, li_info):
self.li_info = li_info
self.good_info_dic = {}
def acquire_comment(self, url):
        '''
        input:
            url: the product detail address (detail_addr), e.g. //item.jd.com/100007046969.html
        return:
            comment_count: the item's comment count. Only an approximate value such as "2万+" can be
            scraped at the moment; JD does not expose the exact total in the HTML (as of 2021.08.03).
        '''
        # extract the product id from the detail URL
no = url.split('com/')[1].split('.html')[0]
comment_url = "https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=" + no + "&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1"
print("评论数获取链接:", comment_url)
response = requests.get(comment_url, headers=Config.headers)
time.sleep(2)
        page = response.content.decode(
            'gbk')  # page is a str of the form fetchJSON_comment98({...}); dic['productCommentSummary']['commentCountStr'] holds the comment count
        # e.g. "commentCountStr":"2万+" -- extract the "2万+"; the approach used here is to split the string and strip the unwanted quote characters
comment_count = page.split("commentCountStr")[1].split(':')[1].split(',')[0].replace('"', '')
return comment_count
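    # Illustrative alternative (a sketch, not part of the original script): the response body is a
    # JSONP payload of the form fetchJSON_comment98({...});, so instead of string splitting one can
    # strip the callback wrapper and read the same field with the json module. The method name
    # acquire_comment_json is made up for this example.
    def acquire_comment_json(self, url):
        import json
        no = url.split('com/')[1].split('.html')[0]
        comment_url = ("https://club.jd.com/comment/productPageComments.action"
                       "?callback=fetchJSON_comment98&productId=" + no +
                       "&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1")
        response = requests.get(comment_url, headers=Config.headers)
        text = response.content.decode('gbk')
        # drop the "fetchJSON_comment98(" prefix and the trailing ");" to obtain plain JSON
        payload = text[text.find('(') + 1:text.rfind(')')]
        data = json.loads(payload)
        return data['productCommentSummary']['commentCountStr']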
def add_product_parameter(self, need_list, url):
html = get_html('https:' + url)
soup = BeautifulSoup(html, 'lxml')
time.sleep(2)
        # locate the product parameter list on the detail page
parameters = soup.find('ul', class_='parameter2 p-parameter-list')
para_lists = parameters.find_all('li')
name_lists = []
para_text_lists = []
for para in para_lists:
para_text = para.get_text()
#print(para_text)
            # para_text looks like "商品名称：浩雅HY160"
            name_lists.append(para_text.split("：")[0])  # name_lists holds the parameter names from the detail page, e.g. "商品名称"
            para_text_lists.append(para_text.split("：")[1])  # para_text_lists holds the corresponding values, e.g. "浩雅HY160"
        return_list = []
        # store the scraped detail-page values in the order given by need_list
        for need in need_list[:-1]:  # the comment count is handled separately; need_list[-1] is '评论数'
try:
index = name_lists.index(need)
return_list.append(para_text_lists[index])
except:
                # if the seller does not list this parameter, fill the Excel cell with a blank
return_list.append(' ')
        # the last column is the comment count
return_list.append(self.acquire_comment(url))
return return_list
def find_attr(self, attr):
try:
            if attr == Config.GOOD_LABEL[0]:
                # product name
                result = self.li_info.find(class_='p-name p-name-type-2').find('em').get_text()
            elif attr == Config.GOOD_LABEL[1]:
                # price
                result = self.li_info.find(class_='p-price').find('i').get_text()
            elif attr == Config.GOOD_LABEL[2]:
                # seller
                result = self.li_info.find(class_='p-shop').find('a').get_text()
            elif attr == Config.GOOD_LABEL[3]:
                # product detail URL
                result = self.li_info.find(class_='p-name p-name-type-2').find('a')['href']
                # go into the individual product detail page; "detail page" here means the page reached by clicking one item in the search results, e.g. https://item.jd.com/100007046969.html
paras = self.add_product_parameter(Config.PARAMETER_LABEL, result)
for i in range(len(paras)):
para = paras[i]
self.good_info_dic.setdefault(Config.PARAMETER_LABEL[i], para)
except AttributeError:
result = '无'
        self.good_info_dic.setdefault(attr, result)  # setdefault: only set the key if it is not already present
    # collect the info for every search-page field
def add_good_info(self):
for item in Config.GOOD_LABEL:
self.find_attr(item)
    # return the collected product info
def get_good(self):
return self.good_info_dic
def get_html(url, currentPage=None, pageSize=None):
if pageSize:
print("--> 正在获取网站第 " + str(currentPage) + "页信息")
if currentPage != 1:
url = url + '&page=' + str(currentPage) + '&s=' + str(pageSize) + '&click=0'
    response = requests.get(url, headers=Config.headers)  # request the page
time.sleep(2)
if response.status_code == 200:
        html = response.text  # page source
        return html
else:
print("获取网站信息失败!")
if __name__ == '__main__':
    '''
    The one thing you must edit is Config.headers; it differs per machine and per JD account.
    After updating headers, try the keyword "餐具碗" with 2 pages to check that the crawl works.
    The other parameters in Config can be adjusted as needed.
    '''
    # create the Excel file
excel = Excel()
config = Config()
    # search URL
search_url = 'https://search.jd.com/Search?keyword=' + config.keyword + '&enc=utf-8&psort=3'
    page = {
        'total': 0,        # total number of pages to crawl
        'currentPage': 1,  # current page number
        'pageSize': 0      # number of items per page
    }
if not config.total.isdigit():
print("非法字符,程序退出!")
exit(0)
    page['total'] = int(config.total)
for i in range(page['total']):
        # parse the search page with BeautifulSoup using the lxml parser
soup = BeautifulSoup(get_html(search_url, page['currentPage'], page['currentPage'] * page['pageSize']), 'lxml')
        time.sleep(2)  # pause 2 seconds after each page parse so the crawler is not blocked by JD's servers
        # product list on this search page
goods_list = soup.find_all('li', class_='gl-item')
print("分析到第" + str(page['currentPage']) + '页共有' + str(len(goods_list)) + '条商品信息')
        for li in goods_list:  # iterate over the product nodes
try:
                goods = Goods(li)
                # collect the info
                goods.add_good_info()
                # get the info dict
                good_info = goods.get_good()
                # write to Excel
                excel.write_content(good_info)
except:
print("商品信息获取失败")
break
page['currentPage'] = page['currentPage'] + 1
page['pageSize'] = len(goods_list) * page['currentPage']
    # save the Excel file
excel.save_file(config.SAVE_PATH)
Crawl results

Update
Crawler code
The example below can be run as-is to crawl "水杯" (water cups) and also saves each item's image locally.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @Time : 2023/7/14
import requests
from bs4 import BeautifulSoup
import xlwt
import time
import urllib.request
from tqdm import tqdm
from xlrd import open_workbook
import os
class Config():
    '''
    PARAMETER_LABEL: adjust these keys to match the spec fields shown on the product detail page
        you are scraping; '评论数' (comment count) is fetched separately via another request.
    TITLE_LABEL: when PARAMETER_LABEL changes, update the Excel header row to match.
    GOOD_LABEL: no need to modify; these fields are always scraped from the search page.
    need_list: the fields that can be read directly from the parsed HTML tags.
    '''
    # Excel header row
TITLE_LABEL = ['商品名称', '价格', '商家', '商品详情地址', '图片', '商品毛重', '商品产地', '类别', '容量',
'适用场景', '附加组件', '使用人群', '国产/进口', '是否带杯盖', '材质', '杯身材质', '功能', '评论数']
    # keys in the HTML corresponding to TITLE_LABEL
GOOD_LABEL = ['name', 'price', 'shop', 'detail_addr', 'image']
    # fields to scrape from the product detail page (the page reached by clicking a single item); '评论数' must stay last
PARAMETER_LABEL = ['商品毛重', '商品产地', '类别', '容量', '适用场景', '附加组件', '使用人群',
'国产/进口', '是否带杯盖', '材质', '杯身材质', '功能','评论数']
    # need_list is PARAMETER_LABEL without '评论数'
need_list = PARAMETER_LABEL[:-1]
    # combine the search-page keys with the detail-page keys
TOTAL_LABEL = GOOD_LABEL + PARAMETER_LABEL
    # save path of the Excel file
SAVE_PATH = './cup.xls'
    # Request headers sent with every request.
    # The headers differ per machine and per JD account; see https://blog.csdn.net/weixin_41998772/article/details/106476166 for how to obtain them.
# headers = {
# 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
# 'accept-language': 'zh-CN,zh;q=0.9',
# }
cookies = {
'__jdv': '76161171|direct|-|none|-|1689218894689',
'__jdu': '16892188946541371543436',
'o2State': '{%22webp%22:true%2C%22avif%22:true}',
'3AB9D23F7A4B3CSS': 'jdd03472A3YFKNZQJ6EHDPSBPDQPS5ROVPXBSHXUY7X2OWRYO46RKKBYNYOHUAY3CR2MOXIBWGG2JIS5IBWFFP4JD2MEX5AAAAAMJJVFN57QAAAAADBMKDMUMGW45EQX',
'_gia_d': '1',
'areaId': '19',
'ipLoc-djd': '19-1666-0-0',
'TrackID': '1LzIjWPkPMVUu06CSqqrNzPmwcFyddXy6t1ne4YXMJCVo8dQEjnAlHhfjw1pha3W8qnq4StU3s2-zPrwHUFn5QggsGxXzckjoy4pLHt7nZOw',
'thor': 'C43F1D5C02A3907E5E5BCDF6D3F20655C53E49D06043DDDE00FE95C8EB5A1BBF18C322D774EDD238443476EDD046A1E62CE1321134F26021F7DD20D989169FA06631F27E73703C5BCE70FF4E2D50BDB35188EAB241F55688682F4F510C3ECDA92ACD564910586C2015691A9EFA9A8244E1565E4DB31E5597273AEE0BAD333A3C3A64CE7F64E1F871296382FD0DE1E105E8C6DC1BFC8807CD814151C769C0C03E',
'flash': '2_OyX7GX17g3lKoYziO9BoOOllWmt4HoX-3_QG9cEOUSddVIQo5m0RrwLqu43vgb26KX1bLNKM5hJxKWrmF7bhbhZd7HBpjTqSG2gRfqhBYHj*',
'pinId': 'JiISwXIgw2P1QvVgGj2fKLV9-x-f3wj7',
'pin': 'jd_5800f4d8105f8',
'unick': '%E5%93%92%E5%93%92%E7%9A%84%E5%B0%8F%E4%B8%80',
'ceshi3.com': '000',
'_tp': 'qmQEay9GtiDAvdnmiflu2C2j1LKDR4y0lT1aSjFQodk%3D',
'_pst': 'jd_5800f4d8105f8',
'__jda': '76161171.16892188946541371543436.1689218895.1689218895.1689218895.1',
'__jdb': '76161171.4.16892188946541371543436|1.1689218895',
'__jdc': '76161171',
'PCSYCityID': 'CN_440000_440600_0',
'shshshfpa': 'b4c2aab0-3952-f8c5-0e18-3ea14805d939-1689218920',
'shshshfpx': 'b4c2aab0-3952-f8c5-0e18-3ea14805d939-1689218920',
'shshshfpb': 'eiIZx9CpqJDcIP1YSZpIrOQ',
'3AB9D23F7A4B3C9B': '472A3YFKNZQJ6EHDPSBPDQPS5ROVPXBSHXUY7X2OWRYO46RKKBYNYOHUAY3CR2MOXIBWGG2JIS5IBWFFP4JD2MEX5A',
}
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
# 'Cookie': '__jdv=76161171|direct|-|none|-|1689218894689; __jdu=16892188946541371543436; o2State={%22webp%22:true%2C%22avif%22:true}; 3AB9D23F7A4B3CSS=jdd03472A3YFKNZQJ6EHDPSBPDQPS5ROVPXBSHXUY7X2OWRYO46RKKBYNYOHUAY3CR2MOXIBWGG2JIS5IBWFFP4JD2MEX5AAAAAMJJVFN57QAAAAADBMKDMUMGW45EQX; _gia_d=1; areaId=19; ipLoc-djd=19-1666-0-0; TrackID=1LzIjWPkPMVUu06CSqqrNzPmwcFyddXy6t1ne4YXMJCVo8dQEjnAlHhfjw1pha3W8qnq4StU3s2-zPrwHUFn5QggsGxXzckjoy4pLHt7nZOw; thor=C43F1D5C02A3907E5E5BCDF6D3F20655C53E49D06043DDDE00FE95C8EB5A1BBF18C322D774EDD238443476EDD046A1E62CE1321134F26021F7DD20D989169FA06631F27E73703C5BCE70FF4E2D50BDB35188EAB241F55688682F4F510C3ECDA92ACD564910586C2015691A9EFA9A8244E1565E4DB31E5597273AEE0BAD333A3C3A64CE7F64E1F871296382FD0DE1E105E8C6DC1BFC8807CD814151C769C0C03E; flash=2_OyX7GX17g3lKoYziO9BoOOllWmt4HoX-3_QG9cEOUSddVIQo5m0RrwLqu43vgb26KX1bLNKM5hJxKWrmF7bhbhZd7HBpjTqSG2gRfqhBYHj*; pinId=JiISwXIgw2P1QvVgGj2fKLV9-x-f3wj7; pin=jd_5800f4d8105f8; unick=%E5%93%92%E5%93%92%E7%9A%84%E5%B0%8F%E4%B8%80; ceshi3.com=000; _tp=qmQEay9GtiDAvdnmiflu2C2j1LKDR4y0lT1aSjFQodk%3D; _pst=jd_5800f4d8105f8; __jda=76161171.16892188946541371543436.1689218895.1689218895.1689218895.1; __jdb=76161171.4.16892188946541371543436|1.1689218895; __jdc=76161171; PCSYCityID=CN_440000_440600_0; shshshfpa=b4c2aab0-3952-f8c5-0e18-3ea14805d939-1689218920; shshshfpx=b4c2aab0-3952-f8c5-0e18-3ea14805d939-1689218920; shshshfpb=eiIZx9CpqJDcIP1YSZpIrOQ; 3AB9D23F7A4B3C9B=472A3YFKNZQJ6EHDPSBPDQPS5ROVPXBSHXUY7X2OWRYO46RKKBYNYOHUAY3CR2MOXIBWGG2JIS5IBWFFP4JD2MEX5A',
'Referer': 'https://passport.jd.com/',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-site',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
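    # Note (an illustrative aside, not part of the original script): the commented-out 'Cookie'
    # header above is just the raw "k=v; k2=v2" form of the cookies dict defined earlier. A fresh
    # cookie string copied from the browser's developer tools can be turned into the dict that
    # requests expects with, for example (raw_cookie is a placeholder name):
    #     cookies = dict(item.split('=', 1) for item in raw_cookie.split('; '))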
    # search keyword
keyword = str(input('请输入需要爬取的信息关键字:'))
total = input('请输入需要爬取页数: ')
class Excel():
    # number of table columns
    TABLE_COL = len(Config.TITLE_LABEL)
    # current row index
    _current_row = 1
    # initialise: create the workbook (or reopen an existing one) and write the header row
def __init__(self, sheet_name='sheet1'):
        if os.path.exists(Config.SAVE_PATH):
            # if the xls file already exists, copy its rows over so new data is appended after them
book = open_workbook(Config.SAVE_PATH, formatting_info=True)
self.write_work = xlwt.Workbook(encoding='ascii')
sheet = book.sheet_by_index(0)
self.write_sheet = self.write_work.add_sheet(sheet_name)
for row in range(sheet.nrows):
for col in range(sheet.ncols):
self.write_sheet.write(row, col, sheet.cell(row, col).value)
self._current_row = sheet.nrows
else:
self.write_work = xlwt.Workbook(encoding='ascii')
self.write_sheet = self.write_work.add_sheet(sheet_name)
for item in range(len(Config.TITLE_LABEL)):
                # write the Excel header into the first row
self.write_sheet.write(0, item, label=Config.TITLE_LABEL[item])
    # write one record
def write_content(self, content):
print(content)
        if content['detail_addr'] != '无':  # skip items whose detail address could not be obtained
            for item in range(self.TABLE_COL):
                self.write_sheet.write(self._current_row, item, label=content[Config.TOTAL_LABEL[item]])
            # move to the next row after inserting one record
            self._current_row += 1
            # save after every new record, so the file is not lost if the run fails midway
            self.save_file(Config.SAVE_PATH)
        else:
print("商品详细地址未获取!")
    # save the file
def save_file(self, file_url=Config.SAVE_PATH):
try:
self.write_work.save(file_url)
print("文件保存成功!文件路径为:" + file_url)
except IOError:
print("文件保存失败!")
class Goods:
    # initialiser
def __init__(self, li_info):
self.li_info = li_info
self.good_info_dic = {}
def acquire_comment(self, url):
        '''
        input:
            url: the product detail address (detail_addr), e.g. //item.jd.com/100007046969.html
        return:
            comment_count: the item's comment count. Only an approximate value such as "2万+" can be
            scraped at the moment; JD does not expose the exact total in the HTML (as of 2021.08.03).
        '''
        # extract the product id from the detail URL
no = url.split('com/')[1].split('.html')[0]
comment_url = "https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=" + no + "&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1"
response = requests.get(comment_url, headers=Config.headers, cookies=Config.cookies)
time.sleep(3)
response.encoding = response.apparent_encoding
print("获取评论页格式:",response.encoding)
response.raise_for_status()
if response.encoding == None:
return "获取失败"
        page = response.content.decode(response.encoding)  # page is a str of the form fetchJSON_comment98({...}); dic['productCommentSummary']['commentCountStr'] holds the comment count
        # e.g. "commentCountStr":"2万+" -- extract the "2万+"; the approach used here is to split the string and strip the unwanted quote characters
comment_count = page.split("commentCountStr")[1].split(':')[1].split(',')[0].replace('"', '')
        response.close()  # close the response promptly; otherwise errors can occur later
return comment_count
def add_product_parameter(self, need_list, url):
html = get_html('https:' + url)
# print("商品详情信息:", html)
soup = BeautifulSoup(html, 'html.parser')
time.sleep(3)
        # locate the product parameter list on the detail page
parameters = soup.find('ul', class_='parameter2 p-parameter-list')
        if parameters is None:
            # If this happens and the parsed page is just <title>京东安全</title>, JD has detected the
            # crawler and triggered its anti-bot check; increase the time.sleep values in the code.
            print("------------------")
            print(html)
            print("------------------")
para_lists = parameters.find_all('li')
name_lists = []
para_text_lists = []
for para in para_lists:
para_text = para.get_text()
# print(para_text)
            # para_text looks like "商品名称：浩雅HY160"
            name_lists.append(para_text.split("：")[0])  # name_lists holds the parameter names from the detail page, e.g. "商品名称"
            para_text_lists.append(para_text.split("：")[1])  # para_text_lists holds the corresponding values, e.g. "浩雅HY160"
        return_list = []
        # store the scraped detail-page values in the order given by need_list
        for need in need_list[:-1]:  # the comment count is handled separately; need_list[-1] is '评论数'
if need in name_lists:
index = name_lists.index(need)
return_list.append(para_text_lists[index])
else:
return_list.append(' ')
        # the last column is the comment count
return_list.append(self.acquire_comment(url))
return return_list
def find_attr(self, attr):
try:
            if attr == Config.GOOD_LABEL[0]:
                # product name
                result = self.li_info.find(class_='p-name p-name-type-2').find('em').get_text()
            elif attr == Config.GOOD_LABEL[1]:
                # price
                result = self.li_info.find(class_='p-price').find('i').get_text()
            elif attr == Config.GOOD_LABEL[2]:
                # seller
                result = self.li_info.find(class_='p-shop').find('a').get_text()
            elif attr == Config.GOOD_LABEL[4]:
                # product image: download it and record the local path in the sheet
                temp = self.li_info.find(class_='p-img').find('a').find('img').get('data-lazy-img')
                print("图片原地址:", temp)
                os.makedirs("./cups", exist_ok=True)  # make sure the image folder exists before saving
                result = "./cups/" + str(excel._current_row) + ".jpg"
                imageurl = "http:" + temp
                try:
                    urllib.request.urlretrieve(imageurl, filename=result)
                except:
                    result = "保存失败"
elif attr == Config.GOOD_LABEL[3]:
try:
                    # product detail URL
result = self.li_info.find(class_='p-name p-name-type-2').find('a')['href']
print("商品详情信息:",result)
                    # go into the individual product detail page; "detail page" here means the page reached by clicking one item in the search results, e.g. https://item.jd.com/100007046969.html
paras = self.add_product_parameter(Config.PARAMETER_LABEL, result)
for i in range(len(paras)):
para = paras[i]
self.good_info_dic.setdefault(Config.PARAMETER_LABEL[i], para)
except Exception as e:
print("\033[42m Error:{} \033[0m".format(e))
except AttributeError:
result = '无'
        self.good_info_dic.setdefault(attr, result)  # setdefault: only set the key if it is not already present
    # collect the info for every search-page field
def add_good_info(self):
for item in Config.GOOD_LABEL:
self.find_attr(item)
    # return the collected product info
def get_good(self):
return self.good_info_dic
def get_html(url, currentPage=None, pageSize=None):
if pageSize:
print("--> 正在获取网站第 " + str(currentPage) + "页信息")
if currentPage != 1:
url = url + '&page=' + str(currentPage) + '&s=' + str(pageSize) + '&click=0'
    response = requests.get(url, headers=Config.headers, cookies=Config.cookies, timeout=30)  # request the page
response.encoding = response.apparent_encoding
response.raise_for_status()
time.sleep(3)
if response.status_code == 200:
        html = response.text  # page source
        response.close()
        return html
else:
response.close()
print("获取网站信息失败!")
if __name__ == '__main__':
    '''
    The one thing you must edit is Config.headers; it differs per machine and per JD account.
    After updating headers, try the keyword "家用水杯办公" with 2 pages to check that the crawl works.
    The other parameters in Config can be adjusted as needed.
    '''
    # create the Excel file
excel = Excel()
config = Config()
    # search URL
search_url = 'https://search.jd.com/Search?keyword=' + config.keyword + '&enc=utf-8&psort=3'
    page = {
        'total': 0,        # total number of pages to crawl
        'currentPage': 1,  # current page number
        'pageSize': 0      # number of items per page
    }
if not config.total.isdigit():
print("非法字符,程序退出!")
exit(0)
    page['total'] = int(config.total) - page['currentPage']
for i in tqdm(range(page['total'])):
        # parse the search page with BeautifulSoup using the lxml parser
soup = BeautifulSoup(get_html(search_url, page['currentPage'], page['currentPage'] * page['pageSize']), 'lxml')
        time.sleep(3)  # pause 3 seconds after each page parse so the crawler is not blocked by JD's servers
        # product list on this search page
goods_list = soup.find_all('li', class_='gl-item')
print("分析到第" + str(page['currentPage']) + '页共有' + str(len(goods_list)) + '条商品信息')
        for li in goods_list:  # iterate over the product nodes
            # when debugging problems, comment out this try so errors surface instead of being swallowed
try:
print("----------------------------------------------------------------------")
                goods = Goods(li)
                # collect the info
                goods.add_good_info()
                # get the info dict
                good_info = goods.get_good()
                # write to Excel
                excel.write_content(good_info)
except:
print("商品信息获取失败")
break
page['currentPage'] = page['currentPage'] + 1
page['pageSize'] = len(goods_list) * page['currentPage']


Data analysis code
Written in Jupyter; the visualisations use pyecharts (official site: https://pyecharts.org/#/zh-cn/intro).
from pyecharts.charts import Bar, Pie
import pandas as pd
import pyecharts.options as opts
from pyecharts.globals import ThemeType
import matplotlib.pyplot as plt
from PIL import Image
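
The '评论数' column read back from the Excel file holds strings such as "2万+" or "500+". The cells below convert them to numbers inline with split('+') and split('万'); the same logic could be factored into a small helper, shown here as an illustrative sketch (the name parse_count is made up for this example):
def parse_count(text):
    # "2万+" -> ['2', ''] -> 20000; "500+" -> ['500'] -> 500
    count = text.split('+')[0].split('万')
    if len(count) == 2:
        return int(count[0]) * 10000
    return int(count[0])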
- Visualising cup materials
df = pd.read_excel("./cup.xls")
# some items are listed under multiple links, so drop rows with duplicate detail addresses first
df.drop_duplicates(subset=['商品详情地址'], keep='first', inplace=True)
# drop items with zero sales (comment count), failed fetches, and items with no material listed
filtered_df = df[(df['评论数'] != "0") & (df['评论数'] != "获取失败") & (df['材质'] != ' ')]
# all distinct material categories
mat_classes = filtered_df['材质'].unique().tolist()
mat_counts = [0 for _ in range(len(mat_classes))]
# total sales (approximated by the comment count) per material
num_rows = len(filtered_df)
for i in range(num_rows):
row = filtered_df.iloc[i]
index = mat_classes.index(row['材质'])
count = row['评论数'].split('+')[0].split('万')
if len(count) == 2:
mat_counts[index] += int(count[0])*10000
elif len(count) == 1:
mat_counts[index] += int(count[0])
# sort in descending order
sorted_tuples = sorted(zip(mat_classes, mat_counts), key=lambda x: x[1], reverse=True)
sorted_mat_classes = [x[0] for x in sorted_tuples]
sorted_mat_counts = [x[1] for x in sorted_tuples]
# draw the bar chart
bar = (
Bar()
.add_xaxis(sorted_mat_classes)
.add_yaxis("销售额", sorted_mat_counts)
.set_global_opts(title_opts=opts.TitleOpts(title="材质"))#, subtitle="副标题"
)
bar.set_global_opts(
xaxis_opts={
'axisLabel': {
'rotate': 45
}
}
)
# in a Jupyter notebook, calling render_notebook() renders the chart inline
bar.render_notebook()
# bar.render('F:/vsCode/everyThing/material.html')

# draw the pie chart
pie_chart = (
Pie(init_opts=opts.InitOpts(theme=ThemeType.SHINE))
.add("", [list(z) for z in zip(sorted_mat_classes, sorted_mat_counts)])
# .set_global_opts(title_opts=opts.TitleOpts(title="材质"))
.set_series_opts(
label_opts=opts.LabelOpts(
formatter="{b}: {d}%",
font_size=12,
position="outside",
),
        # adjust the pie chart size
radius=["0%", "80%"],
)
)
pie_chart.render_notebook()

- Visualising cup capacities
# drop items with zero sales or failed fetches, and items whose capacity is missing or ambiguous
filtered_df = df[(df['评论数'] != "0")
& (df['评论数'] != "获取失败")
& (df['容量'] != ' ')
& (df['容量'] != '≤50mL,51-100mL,101-200mL,201-300mL,301-400mL,401-500mL,501-600mL,601-700mL,701-800mL,801-900mL,901-1000mL,1L以上')
& (df['容量'] != '其他')
& (df['容量'] != '其它')
& (df['容量'] != '201-300mL,301-400mL')
& (df['容量'] != '201-300mL,301-400mL,401-500mL')
& (df['容量'] != '201-300mL,801-900mL')
& (df['容量'] != '301-400mL,401-500mL,501-600mL')]
vol_classes = filtered_df['容量'].unique().tolist()
vol_counts = [0 for _ in range(len(vol_classes))]
num_rows = len(filtered_df)
for i in range(num_rows):
row = filtered_df.iloc[i]
index = vol_classes.index(row['容量'])
count = row['评论数'].split('+')[0].split('万')
if len(count) == 2:
vol_counts[index] += int(count[0])*10000
elif len(count) == 1:
vol_counts[index] += int(count[0])
# merge duplicate categories that differ only in unit spelling (mL vs ml)
index1 = vol_classes.index('101-200mL')
index2 = vol_classes.index('101-200ml')
temp = vol_counts[index1] + vol_counts[index2]
vol_counts[index1] = temp
del vol_classes[index2]
del vol_counts[index2]
index1 = vol_classes.index('401-500mL')
index2 = vol_classes.index('401-500ml')
temp = vol_counts[index1] + vol_counts[index2]
vol_counts[index1] = temp
del vol_classes[index2]
del vol_counts[index2]
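# Note (an illustrative alternative, not in the original notebook): the manual merges above are
# needed because some capacity strings use "ml" and others "mL". Normalising the column first,
# e.g. filtered_df['容量'] = filtered_df['容量'].str.replace('ml', 'mL'), before calling unique()
# would avoid the pairwise merging.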
sorted_tuples = sorted(zip(vol_classes, vol_counts), key=lambda x: x[1], reverse=True)
sorted_vol_classes = [x[0] for x in sorted_tuples]
sorted_vol_counts = [x[1] for x in sorted_tuples]
bar = (
Bar()
.add_xaxis(sorted_vol_classes)
.add_yaxis("销售额", sorted_vol_counts)
.set_global_opts(title_opts=opts.TitleOpts(title="容量"))#, subtitle="副标题"
)
bar.set_global_opts(
xaxis_opts={
'axisLabel': {
'rotate': 45
}
}
)
bar.render_notebook()
# bar.render('F:/vsCode/everyThing/volume.html')

pie_chart = (
Pie(init_opts=opts.InitOpts(theme=ThemeType.SHINE))
.add("", [list(z) for z in zip(sorted_vol_classes, sorted_vol_counts)])
# .set_global_opts(title_opts=opts.TitleOpts(title="容量"))
.set_series_opts(
label_opts=opts.LabelOpts(
formatter="{b}: {d}%",
font_size=12,
position="outside",
),
        # adjust the pie chart size
radius=["0%", "60%"],
)
)
pie_chart.render_notebook()

- Cups with the highest sales (top 60)
# drop items with zero sales or failed fetches
filtered_df = df[(df['评论数'] != "0") & (df['评论数'] != "获取失败")]
sales = []
indexs = []
num_rows = len(filtered_df)
for i in range(num_rows):
row = filtered_df.iloc[i]
count = row['评论数'].split('+')[0].split('万')
indexs.append(i)
if len(count) == 2:
sales.append(int(count[0])*10000)
elif len(count) == 1:
sales.append(int(count[0]))
# sort by sales and slice out a block of the top sellers
sorted_tuples = sorted(zip(indexs, sales), key=lambda x: x[1], reverse=True)
sorted_indexs = [x[0] for x in sorted_tuples][40:60]  # [0:20] gives ranks 1-20, [20:40] gives ranks 21-40, and so on
sorted_sales = [x[1] for x in sorted_tuples][40:60]
plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font so Chinese labels display correctly
plt.figure(figsize=(25, 20))
for i in range(len(sorted_indexs)):
index = sorted_indexs[i]
row = filtered_df.iloc[index]
imgPath = "." + row['图片'][1:]
imgNum = row['图片'].split('/')[-1]
img = Image.open(imgPath)
count = row['评论数']
sale = sorted_sales[i]
plt.subplot(4,5,i+1)
plt.imshow(img)
plt.title(imgNum + " 销售额:"+count)
plt.savefig("./03.jpg", bbox_inches='tight')
plt.show()


