Python_爬取JD商品信息

header的生成

主流网站大多限制了爬虫,一般通过识别请求的header信息来判断访问者是真人还是爬虫。直接通过requests获取商品页面会被重定向到登录界面,配置了header之后才可以正常获取信息。
对header不熟悉的可通过工具来生成header

https://curl.trillworks.com/

  • 首先确定要爬取页面的url,然后在浏览器的开发者模式中将该请求复制为curl命令

  • 将curl命令粘贴到上述工具中生成代码,只保留其中的params和headers,将post改成get,最后输出结果

re+requests爬取JD商品信息

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import requests
import re

def getHtmlText():
    """Fetch the JD search result page for the hard-coded keyword.

    Sends a GET request to ``https://search.jd.com/Search`` with browser-like
    headers and the fixed query parameters defined below.

    Returns:
        The decoded response body, or an empty string when the request fails
        (connection error, timeout or non-2xx status). The failure is printed
        instead of being raised.
    """
    # Browser-like headers; they are sent unchanged with the request.
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
        'Accept': '*/*',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'empty',
        'Referer': 'https://search.jd.com/',
        'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6',
        'Origin': 'https://search.jd.com',
        'If-Modified-Since': 'Thu, 14 May 2020 11:33:47 GMT',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Pragma': 'no-cache',
    }

    # NOTE(review): the trailing '^' in several values looks like a leftover
    # shell escape from a copied curl command -- confirm before removing.
    params = (
        ('keyword', '思想本质'),
        ('enc', 'utf-8^'),
        ('suggest', '4.his.0.0^'),
        ('www', '^'),
        ('pvid', '60fd02742beb4eb78dc4f23c90b7fd31'),
    )

    try:
        # The timeout keeps the script from hanging on a stalled connection.
        response = requests.get(
            'https://search.jd.com/Search',
            headers=headers,
            params=params,
            timeout=10,
        )
        response.raise_for_status()
    except requests.RequestException as exc:
        # `response` may not exist here (requests.get itself can raise), so
        # only the exception object is reported.
        print(exc)
        return ''

    # Decode with the encoding guessed from the body rather than the header.
    response.encoding = response.apparent_encoding
    return response.text

def parsePage(flist,html):
    """Extract [price, promo text, author] rows from a JD search page.

    Three regular expressions collect the price tags, the promo-word tags and
    the author links from ``html``. The i-th match of each list is combined
    into one row; rows are produced only while all three lists have an entry,
    so a shorter list never causes stale values to be appended.

    Args:
        flist: List that receives one ``[price, promo, author]`` list per row.
            It is modified in place.
        html: Page source as a string. Anything that is not a string (for
            example ``None`` from a failed download) yields no rows.

    Returns:
        None. Results are delivered through ``flist``.
    """
    if not isinstance(html, str):
        return

    price_tags = re.findall(r'<em>\¥<\/em><i>[\d.]*<\/i>', html)
    promo_tags = re.findall(r'<i class=\"promo-words\" id=\"J_AD_[\d]*\">\S*<\/i>', html)
    author_tags = re.findall(r'<span class=\"p-bi-name\" onclick=.*>\s*<a title=\".*\" href=.* target=\".*\">.*<\/a>', html)

    # zip() stops at the shortest list, which keeps the three columns aligned
    # by position. NOTE(review): positional pairing assumes every product on
    # the page has all three tags -- verify against real pages.
    for price_tag, promo_tag, author_tag in zip(price_tags, promo_tags, author_tags):
        # Splitting on angle brackets turns each matched tag into a list of
        # tag names and text pieces; the wanted text sits at a fixed index.
        price = re.split('[><]', price_tag)
        bname = re.split('[><]', promo_tag)
        author = re.split('[><]', author_tag)
        flist.append([price[6], bname[2], author[4]])

def printGoodsList(flist):
    """Print a header line followed by one tab-separated line per row.

    Args:
        flist: Iterable of rows; the first three items of each row are
            printed as price, description and author.
    """
    row_template = "{:4}\t{:<10}\t{:^0}"
    header = row_template.format("价格","简介","作者")
    print(header)
    for goods in flist:
        price, intro, author = goods[0], goods[1], goods[2]
        print(row_template.format(price, intro, author))
def main():
    """Download the search page, parse it and print the extracted rows."""
    goods = []
    # parsePage fills `goods` in place and returns nothing, so its result is
    # not stored.
    parsePage(goods, getHtmlText())
    printGoodsList(goods)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

BeautifulSoup

BeautifulSoup在标签树清晰的页面上更好用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import requests
import bs4

def getHtmlText():
    """Fetch the JD search result page for the hard-coded keyword.

    Sends a GET request to ``https://search.jd.com/Search`` with browser-like
    headers and the fixed query parameters defined below.

    Returns:
        The decoded response body, or an empty string when the request fails
        (connection error, timeout or non-2xx status). The failure is printed
        instead of being raised.
    """
    # Browser-like headers; they are sent unchanged with the request.
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
        'Accept': '*/*',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'empty',
        'Referer': 'https://search.jd.com/',
        'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6',
        'Origin': 'https://search.jd.com',
        'If-Modified-Since': 'Thu, 14 May 2020 11:33:47 GMT',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Pragma': 'no-cache',
    }

    # NOTE(review): the trailing '^' in several values looks like a leftover
    # shell escape from a copied curl command -- confirm before removing.
    params = (
        ('keyword', '思想本质'),
        ('enc', 'utf-8^'),
        ('suggest', '4.his.0.0^'),
        ('www', '^'),
        ('pvid', '60fd02742beb4eb78dc4f23c90b7fd31'),
    )

    try:
        # The timeout keeps the script from hanging on a stalled connection.
        response = requests.get(
            'https://search.jd.com/Search',
            headers=headers,
            params=params,
            timeout=10,
        )
        response.raise_for_status()
    except requests.RequestException as exc:
        # `response` may not exist here (requests.get itself can raise), so
        # only the exception object is reported.
        print(exc)
        return ''

    # Decode with the encoding guessed from the body rather than the header.
    response.encoding = response.apparent_encoding
    return response.text

def parsePage(flist,html):
    """Extract [price, name, book details, shop] rows with BeautifulSoup.

    Looks up the ``<ul>`` whose class attribute is ``gl-warp clearfix`` and
    collects the text of every ``div`` with class ``p-price``, ``p-name``,
    ``p-bookdetails`` and ``p-shopnum`` inside it. The i-th entries of the
    four lists form one row.

    Args:
        flist: List that receives one four-item list per row. It is modified
            in place.
        html: Page source. ``None`` or an empty string yields no rows.

    Returns:
        None. Results are delivered through ``flist``.
    """
    soup = bs4.BeautifulSoup(html or "", "html.parser")
    goods_list = soup.find("ul", class_="gl-warp clearfix")
    if goods_list is None:
        # The result list is missing, so there is nothing to extract.
        return

    prices = goods_list.find_all('div', class_="p-price")
    names = goods_list.find_all('div', class_="p-name")
    details = goods_list.find_all('div', class_="p-bookdetails")
    shops = goods_list.find_all('div', class_="p-shopnum")

    # zip() covers every complete row (including the last one) and stops at
    # the shortest list instead of raising IndexError.
    # NOTE(review): positional pairing assumes every product has all four
    # divs -- verify against real pages.
    for fields in zip(prices, names, details, shops):
        flist.append([tag.get_text(strip=True) for tag in fields])
def printGoodsList(flist):
    """Print a header line followed by one tab-separated line per row.

    Args:
        flist: Iterable of rows; the first four items of each row are printed
            under the four header labels.
    """
    row_template = "{:4}\t{:<}\t{:^}\t{:>}"
    header = row_template.format("价格","简介","作者","出版社")
    print(header)
    for goods in flist:
        first, second, third, fourth = goods[0], goods[1], goods[2], goods[3]
        print(row_template.format(first, second, third, fourth))
def main():
    """Download the search page, parse it and print the extracted rows."""
    goods = []
    # parsePage fills `goods` in place and returns nothing, so its result is
    # not stored.
    parsePage(goods, getHtmlText())
    printGoodsList(goods)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

推荐https://www.w3cschool.cn/python/


Python_爬取JD商品信息
https://imwang77.github.io/2020/11/25/Python_爬取JD商品信息/
作者
imwang77
发布于
2020年11月25日
更新于
2021年5月28日
许可协议