Python_爬取JD商品信息

主流网站都限制了爬虫，一般是通过识别 header 信息来判断访问者是真人还是爬虫。直接通过 requests 获取商品页面会被重定向到登录界面，配置 header 后即可正常获取信息。
对header不熟悉的可通过工具来生成header

https://curl.trillworks.com/

首先获取要爬取页面的 URL，然后在浏览器的开发者工具中把该请求复制为 cURL 命令

复制到https://curl.trillworks.com/生成python的requests语句

只保留params和headers,将post改成get，最后输出

re+requests爬取JD商品信息

import requests
import  re

def getHtmlText():
    """Fetch the JD search-result page for the hard-coded keyword.

    Sends a GET request to ``https://search.jd.com/Search`` with
    browser-like headers and the fixed query parameters below.

    Returns:
        str: The decoded page HTML, or an empty string when the request
        fails (the error is printed instead of being raised).
    """
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
        'Accept': '*/*',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'empty',
        'Referer': 'https://search.jd.com/',
        'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6',
        'Origin': 'https://search.jd.com',
        'If-Modified-Since': 'Thu, 14 May 2020 11:33:47 GMT',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Pragma': 'no-cache',
    }

    # NOTE(review): the trailing '^' characters look like shell-escape
    # residue from the copied cURL command -- confirm before removing them.
    params = (
        ('keyword', '思想本质'),
        ('enc', 'utf-8^'),
        ('suggest', '4.his.0.0^'),
        ('www', '^'),
        ('pvid', '60fd02742beb4eb78dc4f23c90b7fd31'),
    )
    try:
        # A timeout keeps the script from hanging forever on a stalled
        # connection; requests applies none by default.
        response = requests.get(
            'https://search.jd.com/Search',
            headers=headers,
            params=params,
            timeout=10,
        )
        response.raise_for_status()
    except requests.RequestException as err:
        # Covers connection errors, timeouts and HTTP error statuses.
        # ``response`` may not exist here, so report the exception itself.
        print(err)
        return ""
    # Let requests guess the charset from the body before decoding it.
    response.encoding = response.apparent_encoding
    return response.text

def parsePage(flist, html):
    """Extract price, blurb and author rows from search-result HTML.

    Three regular expressions collect the price tags, the promo-word
    tags and the author links; matches are paired up by position.
    Pairing stops at the shortest of the three match lists, so a missing
    field never produces a row built from a previous product's values.

    Args:
        flist: List that receives one ``[price, blurb, author]`` row per
            product. It is mutated in place.
        html: Page source. ``None`` or an empty string yields no rows.

    Returns:
        None.
    """
    if not html:
        return

    price_tags = re.findall(r'<em>\￥<\/em><i>[\d.]*<\/i>', html)
    promo_tags = re.findall(r'<i class=\"promo-words\" id=\"J_AD_[\d]*\">\S*<\/i>', html)
    author_tags = re.findall(r'<span class=\"p-bi-name\" onclick=.*>\s*<a title=\".*\" href=.* target=\".*\">.*<\/a>', html)

    for price_tag, promo_tag, author_tag in zip(price_tags, promo_tags, author_tags):
        # Splitting on angle brackets leaves the text nodes at fixed
        # positions within each matched fragment.
        price = re.split('[><]', price_tag)
        bname = re.split('[><]', promo_tag)
        author = re.split('[><]', author_tag)
        flist.append([price[6], bname[2], author[4]])

def printGoodsList(flist):
    """Print the goods as a tab-separated table preceded by a header row.

    Args:
        flist: Iterable of rows; the first three items of each row are
            printed as price, blurb and author.
    """
    row_format = "{:4}\t{:<10}\t{:^0}"
    print(row_format.format("价格", "简介", "作者"))
    # Lazy generator: each row is formatted right before it is printed.
    formatted_rows = (
        row_format.format(item[0], item[1], item[2]) for item in flist
    )
    for line in formatted_rows:
        print(line)
def main():
    """Fetch the search page, parse it and print the resulting table."""
    flist = []
    html = getHtmlText()
    # Parse only when a page was actually obtained; parsePage fills
    # ``flist`` in place, so its return value is not needed.
    if html:
        parsePage(flist, html)
    printGoodsList(flist)


if __name__ == '__main__':
    main()

BeautifulSoup

BeautifulSoup在标签树清晰的页面上更好用

import requests
import  bs4

def getHtmlText():
    """Fetch the JD search-result page for the hard-coded keyword.

    Sends a GET request to ``https://search.jd.com/Search`` with
    browser-like headers and the fixed query parameters below.

    Returns:
        str: The decoded page HTML, or an empty string when the request
        fails (the error is printed instead of being raised).
    """
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
        'Accept': '*/*',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'empty',
        'Referer': 'https://search.jd.com/',
        'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6',
        'Origin': 'https://search.jd.com',
        'If-Modified-Since': 'Thu, 14 May 2020 11:33:47 GMT',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Pragma': 'no-cache',
    }

    # NOTE(review): the trailing '^' characters look like shell-escape
    # residue from the copied cURL command -- confirm before removing them.
    params = (
        ('keyword', '思想本质'),
        ('enc', 'utf-8^'),
        ('suggest', '4.his.0.0^'),
        ('www', '^'),
        ('pvid', '60fd02742beb4eb78dc4f23c90b7fd31'),
    )
    try:
        # A timeout keeps the script from hanging forever on a stalled
        # connection; requests applies none by default.
        response = requests.get(
            'https://search.jd.com/Search',
            headers=headers,
            params=params,
            timeout=10,
        )
        response.raise_for_status()
    except requests.RequestException as err:
        # Covers connection errors, timeouts and HTTP error statuses.
        # ``response`` may not exist here, so report the exception itself.
        print(err)
        return ""
    # Let requests guess the charset from the body before decoding it.
    response.encoding = response.apparent_encoding
    return response.text

def parsePage(flist, html):
    """Extract product rows from search-result HTML with BeautifulSoup.

    Looks up the ``<ul class="gl-warp clearfix">`` element and collects,
    in document order, the ``div`` elements of four classes (price, name,
    book details, shop). The four lists are paired by position and
    pairing stops at the shortest one.

    Args:
        flist: List that receives one
            ``[price, name, bookdetails, shopnum]`` row per product.
            It is mutated in place.
        html: Page source. ``None`` or an empty string yields no rows.

    Returns:
        None.
    """
    soup = bs4.BeautifulSoup(html or "", "html.parser")
    ulsoup = soup.find("ul", class_="gl-warp clearfix")
    if ulsoup is None:
        # No result list in this page; nothing to extract.
        return

    column_classes = ("p-price", "p-name", "p-bookdetails", "p-shopnum")
    columns = [ulsoup.find_all('div', class_=name) for name in column_classes]
    # zip keeps every complete row (including the last one) and never
    # indexes past the end of a shorter column.
    for cells in zip(*columns):
        flist.append([cell.get_text(strip=True) for cell in cells])
def printGoodsList(flist):
    """Print the goods as a tab-separated table preceded by a header row.

    Args:
        flist: Iterable of rows; the first four items of each row are
            printed as price, blurb, author and publisher.
    """
    row_format = "{:4}\t{:<}\t{:^}\t{:>}"
    print(row_format.format("价格", "简介", "作者", "出版社"))
    # Lazy generator: each row is formatted right before it is printed.
    formatted_rows = (
        row_format.format(item[0], item[1], item[2], item[3])
        for item in flist
    )
    for line in formatted_rows:
        print(line)
def main():
    """Fetch the search page, parse it and print the resulting table."""
    flist = []
    html = getHtmlText()
    # Parse only when a page was actually obtained; parsePage fills
    # ``flist`` in place, so its return value is not needed.
    if html:
        parsePage(flist, html)
    printGoodsList(flist)


if __name__ == '__main__':
    main()

推荐https://www.w3cschool.cn/python/

Python

#Python #crawler

Python_爬取JD商品信息

https://imwang77.github.io/2020/11/25/Python_爬取JD商品信息/

作者

imwang77

发布于

2020年11月25日

更新于

2021年5月28日

许可协议

Other_隐私短信上一篇

LINUX_获取分析内存信息下一篇

Python_爬取JD商品信息

header的生成

https://curl.trillworks.com/

re+requests爬取JD商品信息

BeautifulSoup

推荐https://www.w3cschool.cn/python/