python 爬虫代码

一、代码

1、爬虫_urllib_基本使用


# 使用urllib来获取百度首页的源码
import urllib.request


# (1) The address we want to fetch.
url = 'http://www.baidu.com'

# (2) Send the request the way a browser would.  Using the response as a
# context manager closes the connection as soon as the body has been read
# (the response was previously left open).
with urllib.request.urlopen(url) as response:
    # (3) read() returns the body as raw bytes; decode('<encoding>') turns
    # those bytes into a str.
    content = response.read().decode('utf-8')

# (4) Print the page source.
print(content)

2、爬虫_urllib_1个类型和6个方法

import urllib.request

url = 'http://www.baidu.com'

# Send the request the way a browser would.  The context manager closes the
# connection when the block ends (the response was previously left open).
with urllib.request.urlopen(url) as response:
    # One type and six methods.  `response` is an HTTPResponse; its body is a
    # stream, so enable only one of the read examples at a time:
    #
    # print(type(response))          # the HTTPResponse type
    # print(response.read())         # the whole body, as bytes
    # print(response.read(5))        # at most 5 bytes
    # print(response.readline())     # a single line
    # print(response.readlines())    # every remaining line, as a list
    # print(response.getcode())      # status code; 200 means the request worked
    # print(response.geturl())       # the URL that was retrieved

    # The response headers, as a list of (name, value) pairs.
    print(response.getheaders())

# Summary:
#   one type:    HTTPResponse
#   six methods: read  readline  readlines  getcode  geturl  getheaders


3、爬虫_urllib_下载

import urllib.request

# Download a web page.
# url_page = 'http://www.baidu.com'

# urlretrieve(url, filename): `url` is the resource to download and
# `filename` is the local file it is saved to.  Either argument can be given
# as a variable or as a literal value.
# urllib.request.urlretrieve(url_page,'baidu.html')

# Download an image.
# url_img = 'https://img1.baidu.com/it/u=3004965690,4089234593&fm=26&fmt=auto&gp=0.jpg'
#
# urllib.request.urlretrieve(url= url_img,filename='lisa.jpg')

# Download a video.  The address is split over several adjacent literals
# purely for readability; they concatenate into the same single URL.
url_video = (
    'https://vd3.bdstatic.com/mda-mhkku4ndaka5etk3/1080p/cae_h264/'
    '1629557146541497769/mda-mhkku4ndaka5etk3.mp4'
    '?v_from_s=hkapp-haokan-tucheng'
    '&auth_key=1629687514-0-0-7ed57ed7d1168bb1f06d18a4ea214300'
    '&bcevod_channel=searchbox_feed&pd=1&pt=3&abtest='
)

urllib.request.urlretrieve(url=url_video, filename='hxekyyds.mp4')

4、爬虫_urllib_请求对象的定制



import urllib.request

url = 'https://www.baidu.com'

# Anatomy of a URL, e.g. https://www.baidu.com/s?wd=周杰伦
#
#   http/https    www.baidu.com   80/443   s      wd=周杰伦   #
#   scheme        host            port     path   query       fragment
#
# Default ports of some common services:
#   http    80
#   https   443
#   mysql   3306
#   oracle  1521
#   redis   6379
#   mongodb 27017

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

# urlopen() cannot be handed a headers dict directly, so the headers are
# attached to a customised Request object instead.
request = urllib.request.Request(url=url, headers=headers)

# The context manager closes the connection once the body has been read
# (the response was previously left open).
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf8')

print(content)


5、爬虫_urllib_get请求的quote方法


# https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6


# 需求 获取 https://www.baidu.com/s?wd=周杰伦的网页源码

import urllib.request
import urllib.parse


url = 'https://www.baidu.com/s?wd='

# Customising the request (sending a browser User-Agent) is the first
# counter-measure against anti-scraping checks.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

# Percent-encode the keyword so it can be placed in a URL: quote() turns the
# UTF-8 bytes of the three characters into %XX escapes
# (%E5%91%A8%E6%9D%B0%E4%BC%A6).  This lives in urllib.parse.
name = urllib.parse.quote('周杰伦')

url = url + name

# Build the customised request.
request = urllib.request.Request(url=url, headers=headers)

# Send the request; the context manager closes the connection once the body
# has been read (the response was previously left open).
with urllib.request.urlopen(request) as response:
    # Read and decode the response body.
    content = response.read().decode('utf-8')

# Print the page source.
print(content)

6、爬虫_urllib_get请求的urlencode方法


# urlencode应用场景:多个参数的时候


# https://www.baidu.com/s?wd=周杰伦&sex=男

# import urllib.parse
#
# data = {
#     'wd':'周杰伦',
#     'sex':'男',
#     'location':'中国台湾省'
# }
#
# a = urllib.parse.urlencode(data)
# print(a)


#获取https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7的网页源码

import urllib.request
import urllib.parse

base_url = 'https://www.baidu.com/s?'

data = {
    'wd':'周杰伦',
    'sex':'男',
    'location':'中国台湾省'
}

# urlencode() percent-encodes every key/value pair and joins them with '&',
# which is why it is the tool of choice when there are several parameters.
new_data = urllib.parse.urlencode(data)

# Full resource path: base URL followed by the encoded query string.
url = base_url + new_data

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

# Build the customised request.
request = urllib.request.Request(url=url, headers=headers)

# Send the request; the context manager closes the connection once the body
# has been read (the response was previously left open).
with urllib.request.urlopen(request) as response:
    # Read and decode the page source.
    content = response.read().decode('utf-8')

# Print the page source.
print(content)

7、爬虫_urllib_post请求百度翻译



# post请求

import urllib.request
import urllib.parse


import json

url = 'https://fanyi.baidu.com/sug'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

data = {
    'kw':'spider'
}

# POST parameters must be url-encoded AND then encoded to bytes.
data = urllib.parse.urlencode(data).encode('utf-8')

# POST parameters are not appended to the URL; they are passed to the Request
# through its `data` argument.
request = urllib.request.Request(url=url, data=data, headers=headers)

# Send the request; the context manager closes the connection once the body
# has been read (the response was previously left open).
with urllib.request.urlopen(request) as response:
    # Read and decode the response body.
    content = response.read().decode('utf-8')

# JSON text -> Python object.
obj = json.loads(content)
print(obj)

# Summary of a POST request with urllib:
#   * the parameters must be url-encoded:   data = urllib.parse.urlencode(data)
#   * and then turned into bytes:           data = urllib.parse.urlencode(data).encode('utf-8')
#   * and handed to the Request object:     request = urllib.request.Request(url=url,data=data,headers=headers)


8、爬虫_urllib_post请求百度翻译之详细翻译



import urllib.request
import urllib.parse

import json

url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh'

# Only the Cookie header is actually sent; the other browser headers are kept
# commented out for reference.
#
# The Cookie value used to be one string literal broken across two physical
# lines, which is a syntax error.  It is now written as two adjacent literals,
# which Python concatenates into the same single value.
#
# NOTE(review): this is a hard-coded session cookie.  Credentials should not
# live in source control -- consider loading it from the environment or a
# local config file instead.
headers = {
    # 'Accept': '*/*',
    # 'Accept-Encoding': 'gzip, deflate, br',
    # 'Accept-Language': 'zh-CN,zh;q=0.9',
    # 'Connection': 'keep-alive',
    # 'Content-Length': '135',
    # 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': 'BIDUPSID=DAA8F9F0BD801A2929D96D69CF7EBF50; PSTM=1597202227; BAIDUID=DAA8F9F0BD801A29B2813502000BF8E9:SL=0:NR=10:FG=1; __yjs_duid=1_c19765bd685fa6fa12c2853fc392f8db1618999058029; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; BDUSS=R2bEZvTjFCNHQxdUV-cTZ-MzZrSGxhbUYwSkRkUWk2SkxxS3E2M2lqaFRLUlJoRVFBQUFBJCQAAAAAAAAAAAEAAAA3e~BTveK-9sHLZGF5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFOc7GBTnOxgaW; BDUSS_BFESS=R2bEZvTjFCNHQxdUV-cTZ-MzZrSGxhbUYwSkRkUWk2SkxxS3E2M2lqaFRLUlJoRVFBQUFBJCQAAAAAAAAAAAEAAAA3e~BTveK-9sHLZGF5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFOc7GBTnOxgaW; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BAIDUID_BFESS=DAA8F9F0BD801A29B2813502000BF8E9:SL=0:NR=10:FG=1; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; PSINO=2; H_PS_PSSID=34435_31660_34405_34004_34073_34092_26350_34426_34323_22158_34390; delPer=1; BA_HECTOR=8185a12020018421b61gi6ka20q; BCLID=10943521300863382545; BDSFRCVID=boDOJexroG0YyvRHKn7hh7zlD_weG7bTDYLEOwXPsp3LGJLVJeC6EG0Pts1-dEu-EHtdogKK0mOTHv8F_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tR3aQ5rtKRTffjrnhPF3-44vXP6-hnjy3bRkX4Q4Wpv_Mnndjn6SQh4Wbttf5q3RymJ42-39LPO2hpRjyxv4y4Ldj4oxJpOJ-bCL0p5aHl51fbbvbURvD-ug3-7qqU5dtjTO2bc_5KnlfMQ_bf--QfbQ0hOhqP-jBRIE3-oJqC8hMIt43f; BCLID_BFESS=10943521300863382545; BDSFRCVID_BFESS=boDOJexroG0YyvRHKn7hh7zlD_weG7bTDYLEOwXPsp3LGJLVJeC6EG0Pts1-dEu-EHtdogKK0mOTHv8F_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF_BFESS=tR3aQ5rtKRTffjrnhPF3-44vXP6-hnjy3bRkX4Q4Wpv_Mnndjn6SQh4Wbttf5q3RymJ42-39LPO2hpRjyxv4y4Ldj4oxJpOJ-bCL0p5aHl51fbbvbURvD-ug3-7qqU5dtjTO2bc_5KnlfMQ_bf--QfbQ0hOhqP-jBRIE3-oJqC8hMIt43f; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1629701482,1629702031,1629702343,1629704515; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1629704515; '
              '__yjs_st=2_MDBkZDdkNzg4YzYyZGU2NTM5NzBjZmQ0OTZiMWRmZGUxM2QwYzkwZTc2NTZmMmIxNDJkYzk4NzU1ZDUzN2U3Yjc4ZTJmYjE1YTUzMTljYWFkMWUwYmVmZGEzNmZjN2FlY2M3NDAzOThhZTY5NzI0MjVkMmQ0NWU3MWE1YTJmNGE5NDBhYjVlOWY3MTFiMWNjYTVhYWI0YThlMDVjODBkNWU2NjMwMzY2MjFhZDNkMzVhNGMzMGZkMWY2NjU5YzkxMDk3NTEzODJiZWUyMjEyYTk5YzY4ODUyYzNjZTJjMGM5MzhhMWE5YjU3NTM3NWZiOWQxNmU3MDVkODExYzFjN183XzliY2RhYjgz; ab_sr=1.0.1_ZTc2ZDFkMTU5ZTM0ZTM4MWVlNDU2MGEzYTM4MzZiY2I2MDIxNzY1Nzc1OWZjZGNiZWRhYjU5ZjYwZmNjMTE2ZjIzNmQxMTdiMzIzYTgzZjVjMTY0ZjM1YjMwZTdjMjhiNDRmN2QzMjMwNWRhZmUxYTJjZjZhNTViMGM2ODFlYjE5YTlmMWRjZDAwZGFmMDY4ZTFlNGJiZjU5YzE1MGIxN2FiYTU3NDgzZmI4MDdhMDM5NTQ0MjQxNDBiNzdhMDdl',
    # 'Host': 'fanyi.baidu.com',
    # 'Origin': 'https://fanyi.baidu.com',
    # 'Referer': 'https://fanyi.baidu.com/?aldtype=16047',
    # 'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
    # 'sec-ch-ua-mobile': '?0',
    # 'Sec-Fetch-Dest': 'empty',
    # 'Sec-Fetch-Mode': 'cors',
    # 'Sec-Fetch-Site': 'same-origin',
    # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
    # 'X-Requested-With': 'XMLHttpRequest',
}

# NOTE(review): 'sign' and 'token' look session-specific, and '......' appears
# to be a placeholder -- confirm and supply real values before running.
data = {
    'from': 'en',
    'to': 'zh',
    'query': 'love',
    'transtype': 'realtime',
    'simple_means_flag': '3',
    'sign': '198772.518981',
    'token': '......',
    'domain': 'common',
}
# POST parameters must be url-encoded and then encoded to bytes.
data = urllib.parse.urlencode(data).encode('utf-8')

# Build the customised request.
request = urllib.request.Request(url=url, data=data, headers=headers)

# Send the request; the context manager closes the connection once the body
# has been read (the response was previously left open).
with urllib.request.urlopen(request) as response:
    # Read and decode the response body.
    content = response.read().decode('utf-8')

# JSON text -> Python object.
obj = json.loads(content)
print(obj)

9、爬虫_urllib_ajax的get请求豆瓣电影第一页


# get请求
# 获取豆瓣电影的第一页的数据 并且保存起来

import urllib.request

url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

# (1) Build the customised request.
request = urllib.request.Request(url=url, headers=headers)

# (2) Fetch the response body.  The context manager closes the connection once
# the body has been read (the response was previously left open).
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')

# (3) Save the data locally.
# Without an explicit `encoding`, open() falls back to the platform's locale
# encoding (GBK on a Chinese-locale Windows machine, for example), so pass
# encoding='utf-8' to store Chinese text reliably.
#
# The manual form would be:
#   fp = open('douban.json','w',encoding='utf-8')
#   fp.write(content)
# The `with` form below also closes the file.
with open('douban1.json', 'w', encoding='utf-8') as fp:
    fp.write(content)

10、爬虫_urllib_ajax的get请求豆瓣电影前10页



# https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&
# start=0&limit=20

# https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&
# start=20&limit=20

# https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&
# start=40&limit=20

# https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&
# start=60&limit=20

# page    1  2   3   4
# start   0  20  40  60

# start (page - 1)*20


# 下载豆瓣电影前10页的数据
# (1) 请求对象的定制
# (2) 获取响应的数据
# (3) 下载数据

import urllib.parse
import urllib.request

def create_request(page):
    """Build the GET request for one page of the Douban top-list endpoint.

    Pages are 1-based and each holds 20 entries, so page ``page`` starts at
    offset ``(page - 1) * 20``.

    Args:
        page: 1-based page number.

    Returns:
        A ``urllib.request.Request`` carrying a browser User-Agent header.
    """
    endpoint = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&'
    paging = {'start': (page - 1) * 20, 'limit': 20}
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'

    return urllib.request.Request(
        url=endpoint + urllib.parse.urlencode(paging),
        headers={'User-Agent': user_agent},
    )


def get_content(request):
    """Send *request* and return the response body decoded as UTF-8 text.

    Args:
        request: A URL string or ``urllib.request.Request``, as accepted by
            ``urllib.request.urlopen``.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the connection even when read()/decode()
    # raise; the response was previously never closed.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')


def down_load(page, content):
    """Write *content* to ``douban_<page>.json`` in the current directory.

    Args:
        page: Page number used to build the file name.
        content: Text to store; written as UTF-8.
    """
    target = 'douban_{}.json'.format(str(page))
    with open(target, mode='w', encoding='utf-8') as out_file:
        out_file.write(content)




# Script entry point: ask for an inclusive page range, then fetch and save
# every page in it.
if __name__ == '__main__':
    first_page = int(input('请输入起始的页码'))
    last_page = int(input('请输入结束的页面'))

    for page in range(first_page, last_page + 1):
        # Each page gets its own request object; its body is then fetched
        # and written to disk.
        down_load(page, get_content(create_request(page)))




11、爬虫_urllib_ajax的post请求肯德基官网

# 1页
# http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname
# post
# cname: 北京
# pid:
# pageIndex: 1
# pageSize: 10


# 2页
# http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname
# post
# cname: 北京
# pid:
# pageIndex: 2
# pageSize: 10

import urllib.request
import urllib.parse

def create_request(page):
    """Build the POST request for one page of the KFC store-list endpoint.

    The form fields (city name, empty pid, page index, page size) are
    url-encoded and sent as the UTF-8 encoded request body.

    Args:
        page: Value sent as the ``pageIndex`` form field.

    Returns:
        A ``urllib.request.Request`` with the form body and a browser
        User-Agent header.
    """
    form = urllib.parse.urlencode({
        'cname': '北京',
        'pid': '',
        'pageIndex': page,
        'pageSize': '10',
    })
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'

    return urllib.request.Request(
        url='http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname',
        data=form.encode('utf-8'),
        headers={'User-Agent': user_agent},
    )

def get_content(request):
    """Send *request* and return the response body decoded as UTF-8 text.

    Args:
        request: A URL string or ``urllib.request.Request``, as accepted by
            ``urllib.request.urlopen``.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the connection even when read()/decode()
    # raise; the response was previously never closed.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')


def down_load(page, content):
    """Write *content* to ``kfc_<page>.json`` in the current directory.

    Args:
        page: Page number used to build the file name.
        content: Text to store; written as UTF-8.
    """
    target = 'kfc_{}.json'.format(str(page))
    with open(target, mode='w', encoding='utf-8') as out_file:
        out_file.write(content)



# Script entry point: ask for an inclusive page range, then fetch and save
# every page in it.
if __name__ == '__main__':
    first_page = int(input('请输入起始页码'))
    last_page = int(input('请输入结束页码'))

    for page in range(first_page, last_page + 1):
        # Build the request, fetch its body, and write it to disk.
        down_load(page, get_content(create_request(page)))

12、爬虫_urllib_异常


import urllib.request
import urllib.error

# Alternative target; NOTE(review): presumably an article URL that does not
# exist, used to trigger the HTTPError branch -- confirm.
# url = 'https://blog.csdn.net/sulixu/article/details/1198189491'

url = 'http://www.doudan1111.com'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

request = urllib.request.Request(url=url, headers=headers)

try:
    # Only the network call and the read are guarded.  The context manager
    # closes the connection (the response was previously left open).
    with urllib.request.urlopen(request) as response:
        content = response.read().decode('utf-8')
except urllib.error.HTTPError:
    # HTTPError is a subclass of URLError, so it has to be caught first.
    print('系统正在升级。。。')
except urllib.error.URLError:
    print('我都说了 系统正在升级。。。')
else:
    print(content)

13、爬虫_urllib_微博的cookie登陆

# 适用的场景:数据采集的时候 需要绕过登陆 然后进入到某个页面
# 个人信息页面是utf-8  但是还报错了编码错误  因为并没有进入到个人信息页面 而是跳转到了登陆页面
# 那么登陆页面不是utf-8  所以报错

# 什么情况下访问不成功?
# 因为请求头的信息不够  所以访问不成功

import urllib.request

url = 'https://weibo.cn/6451491586/info'

# Headers copied from a logged-in browser session.  The HTTP/2 pseudo-headers
# and accept-encoding are left commented out.
# NOTE(review): accept-encoding is presumably disabled because urllib does not
# decompress gzip/br response bodies on its own -- confirm.
# NOTE(review): the cookie below is a hard-coded login credential; it should
# not be committed to source control.
headers = {
    # ':authority': 'weibo.cn',
    # ':method': 'GET',
    # ':path': '/6451491586/info',
    # ':scheme': 'https',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    # 'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    # The cookie carries the login state: with a post-login cookie the
    # request can reach pages that would otherwise redirect to the login page.
    'cookie': '_T_WM=24c44910ba98d188fced94ba0da5960e; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFxxfgNNUmXi4YiaYZKr_J_5NHD95QcSh-pSh.pSKncWs4DqcjiqgSXIgvVPcpD; SUB=_2A25MKKG_DeRhGeBK7lMV-S_JwzqIHXVv0s_3rDV6PUJbktCOLXL2kW1NR6e0UHkCGcyvxTYyKB2OV9aloJJ7mUNz; SSOLoginState=1630327279',
    # referer tells the server which page the request came from; it is
    # typically checked for image hot-link protection.
    'referer': 'https://weibo.cn/',
    'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
    'sec-ch-ua-mobile': '?0',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
}

# Build the customised request.
request = urllib.request.Request(url=url, headers=headers)

# Send the request; the context manager closes the connection once the body
# has been read (the response was previously left open).
with urllib.request.urlopen(request) as response:
    # Without a valid cookie the server redirects to the login page, which is
    # not UTF-8, and this decode then fails.
    content = response.read().decode('utf-8')

# Save the page locally.
with open('weibo.html', 'w', encoding='utf-8') as fp:
    fp.write(content)

14、爬虫_urllib_handler处理器的基本使用



# 需求 使用handler来访问百度  获取网页源码

import urllib.request

url = 'http://www.baidu.com'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

request = urllib.request.Request(url=url, headers=headers)

# Three steps: handler -> build_opener -> open

# (1) Create the handler object.
handler = urllib.request.HTTPHandler()

# (2) Build an opener around it.
opener = urllib.request.build_opener(handler)

# (3) Call open(); the context manager closes the connection once the body
# has been read (the response was previously left open).
with opener.open(request) as response:
    content = response.read().decode('utf-8')

print(content)

15、爬虫_urllib_代理

import urllib.request

url = 'http://www.baidu.com/s?wd=ip'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

# Build the customised request.
request = urllib.request.Request(url=url, headers=headers)

# The direct (proxy-less) call would be:
# response = urllib.request.urlopen(request)

# Maps URL scheme -> 'host:port' of the proxy to use for that scheme.
proxies = {
    'http':'118.24.219.151:16817'
}

# Three steps: handler -> build_opener -> open
handler = urllib.request.ProxyHandler(proxies=proxies)

opener = urllib.request.build_opener(handler)

# The context manager closes the connection once the body has been read
# (the response was previously left open).
with opener.open(request) as response:
    # Read and decode the response body.
    content = response.read().decode('utf-8')

# Save the page locally.
with open('daili.html', 'w', encoding='utf-8') as fp:
    fp.write(content)


16、爬虫_urllib_代理池

import urllib.request

import random

# Pool of proxy mappings (URL scheme -> 'host:port'); one is picked at random
# for each run.
proxies_pool = [
    {'http':'118.24.219.151:16817'},
    {'http':'118.24.219.151:16817'},
]

proxies = random.choice(proxies_pool)

url = 'http://www.baidu.com/s?wd=ip'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

request = urllib.request.Request(url=url, headers=headers)

# Three steps: handler -> build_opener -> open
handler = urllib.request.ProxyHandler(proxies=proxies)

opener = urllib.request.build_opener(handler)

# The context manager closes the connection once the body has been read
# (the response was previously left open).
with opener.open(request) as response:
    content = response.read().decode('utf-8')

# Save the page locally.
with open('daili.html', 'w', encoding='utf-8') as fp:
    fp.write(content)

17、爬虫_解析_xpath的基本使用

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8"/>
    <title>Title</title>
</head>
<body>
    <ul>
        <li id="l1" class="c1">北京</li>
        <li id="l2">上海</li>
        <li id="c3">深圳</li>
        <li id="c4">武汉</li>
    </ul>

<!--    <ul>-->
<!--        <li>大连</li>-->
<!--        <li>锦州</li>-->
<!--        <li>沈阳</li>-->
<!--    </ul>-->
</body>
</html>

18、爬虫_解析_xpath的基本使用

from lxml import etree

# XPath parsing has two entry points:
# (1) a local file                                      -> etree.parse()
# (2) data returned by a server,
#     i.e. response.read().decode('utf-8')              -> etree.HTML()   (the common case)

# Parse the local HTML file.
tree = etree.parse('爬虫_解析_xpath的基本使用.html')

# General form: tree.xpath('<xpath expression>')

# All <li> elements under <ul>.
# li_list = tree.xpath('//body/ul/li')

# All <li> elements that have an id attribute; text() yields the element text.
# li_list = tree.xpath('//ul/li[@id]/text()')

# The <li> whose id is "l1" -- mind the nested quotes.
# li_list = tree.xpath('//ul/li[@id="l1"]/text()')

# The value of the class attribute of the <li> whose id is "l1".
# li = tree.xpath('//ul/li[@id="l1"]/@class')

# <li> elements whose id contains "l".
# li_list = tree.xpath('//ul/li[contains(@id,"l")]/text()')

# <li> elements whose id starts with "c".
# li_list = tree.xpath('//ul/li[starts-with(@id,"c")]/text()')

# The <li> whose id is "l1" AND whose class is "c1".
# li_list = tree.xpath('//ul/li[@id="l1" and @class="c1"]/text()')

# Union of two queries: the text of the <li> with id "l1" or id "l2".
union_query = '//ul/li[@id="l1"]/text() | //ul/li[@id="l2"]/text()'
li_list = tree.xpath(union_query)

# Show the matches and how many there are.
print(li_list, len(li_list), sep='\n')


19、爬虫_解析_获取百度网站的百度一下



# (1) 获取网页的源码
# (2) 解析   解析的服务器响应的文件  etree.HTML
# (3)  打印

import urllib.request

from lxml import etree

url = 'https://www.baidu.com/'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

# Build the customised request.
request = urllib.request.Request(url=url, headers=headers)

# Send the request; the context manager closes the connection once the body
# has been read (the response was previously left open).
with urllib.request.urlopen(request) as response:
    # Read and decode the page source.
    content = response.read().decode('utf-8')

# Parse the server response (etree.HTML is the entry point for response
# text, as opposed to etree.parse for local files).
tree = etree.HTML(content)

# xpath() returns a list; take the first match, the value attribute of the
# <input id="su"> element.  This raises IndexError if the page has no such
# element.
result = tree.xpath('//input[@id="su"]/@value')[0]

print(result)

20、爬虫_解析_站长素材



# (1) 请求对象的定制
# (2)获取网页的源码
# (3)下载


# 需求 下载的前十页的图片
# https://sc.chinaz.com/tupian/qinglvtupian.html   1
# https://sc.chinaz.com/tupian/qinglvtupian_page.html

import urllib.request
from lxml import etree

def create_request(page):
    """Build the GET request for one listing page of the image gallery.

    The first page has no numeric suffix (``qinglvtupian.html``); every other
    page is ``qinglvtupian_<page>.html``.

    Args:
        page: 1-based page number.

    Returns:
        A ``urllib.request.Request`` carrying a browser User-Agent header.
    """
    suffix = '' if page == 1 else '_' + str(page)
    page_url = 'https://sc.chinaz.com/tupian/qinglvtupian' + suffix + '.html'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'

    return urllib.request.Request(url=page_url, headers={'User-Agent': user_agent})

def get_content(request):
    """Send *request* and return the response body decoded as UTF-8 text.

    Args:
        request: A URL string or ``urllib.request.Request``, as accepted by
            ``urllib.request.urlopen``.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the connection even when read()/decode()
    # raise; the response was previously never closed.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')


def down_load(content):
    """Download every gallery image found in *content* into ``./loveImg/``.

    Each ``<img>`` under ``div#container`` supplies the file name (``alt``)
    and the image address (``src2``).  Images missing either attribute are
    skipped.

    Args:
        content: HTML source of one listing page.

    Raises:
        urllib.error.URLError: Propagated from ``urlretrieve`` when a
            download fails.
    """
    # Local import so this function does not depend on a file-level
    # `import os`.
    import os

    tree = etree.HTML(content)

    # urlretrieve cannot create missing directories, so make sure the target
    # folder exists first.
    target_dir = './loveImg/'
    os.makedirs(target_dir, exist_ok=True)

    # Read both attributes from the same <img> element.  Pairing two
    # separately collected lists by index mis-matches names and addresses (or
    # raises IndexError) as soon as one image lacks one of the attributes.
    for img in tree.xpath('//div[@id="container"]//a/img'):
        name = img.get('alt')
        # Image sites commonly lazy-load, so the address is read from `src2`
        # rather than `src`.
        src = img.get('src2')
        if not name or not src:
            continue

        # The page gives protocol-relative addresses; prepend the scheme.
        url = 'https:' + src

        urllib.request.urlretrieve(url=url, filename=target_dir + name + '.jpg')




# Script entry point: ask for an inclusive page range, then download the
# images of every listing page in it.
if __name__ == '__main__':
    first_page = int(input('请输入起始页码'))
    last_page = int(input('请输入结束页码'))

    for page in range(first_page, last_page + 1):
        # (1) build the request, (2) fetch the page source, (3) download.
        down_load(get_content(create_request(page)))

21、爬虫_解析_jsonpath

{ "store": {
    "book": [
      { "category": "修真",
        "author": "六道",
        "title": "坏蛋是怎样练成的",
        "price": 8.95
      },
      { "category": "修真",
        "author": "天蚕土豆",
        "title": "斗破苍穹",
        "price": 12.99
      },
      { "category": "修真",
        "author": "唐家三少",
        "title": "斗罗大陆",
        "isbn": "0-553-21311-3",
        "price": 8.99
      },
      { "category": "修真",
        "author": "南派三叔",
        "title": "星辰变",
        "isbn": "0-395-19395-8",
        "price": 22.99
      }
    ],
    "bicycle": {
      "author": "老马",
      "color": "黑色",
      "price": 19.95
    }
  }
}

22、爬虫_解析_jsonpath


import json
import jsonpath


# Load the sample document.  The `with` block closes the file; the previous
# json.load(open(...)) never closed the handle.
with open('爬虫_解析_jsonpath.json', 'r', encoding='utf-8') as fp:
    obj = json.load(fp)

# Authors of every book in the store.
# author_list = jsonpath.jsonpath(obj,'$.store.book[*].author')
# print(author_list)

# Every author anywhere in the document.
# author_list = jsonpath.jsonpath(obj,'$..author')
# print(author_list)

# Every element directly under store.
# tag_list = jsonpath.jsonpath(obj,'$.store.*')
# print(tag_list)

# The price of everything inside store.
# price_list = jsonpath.jsonpath(obj,'$.store..price')
# print(price_list)

# The third book.
# book = jsonpath.jsonpath(obj,'$..book[2]')
# print(book)

# The last book.
# book = jsonpath.jsonpath(obj,'$..book[(@.length-1)]')
# print(book)

# The first two books.
# book_list = jsonpath.jsonpath(obj,'$..book[0,1]')
# book_list = jsonpath.jsonpath(obj,'$..book[:2]')
# print(book_list)

# Filter expressions need a ? in front of the parentheses.
# All books that have an isbn.
# book_list = jsonpath.jsonpath(obj,'$..book[?(@.isbn)]')
# print(book_list)

# Books that cost more than 10.
book_list = jsonpath.jsonpath(obj, '$..book[?(@.price>10)]')
print(book_list)

23、爬虫_解析_jsonpath解析淘票票

{"returnCode":"0","returnValue":{"A":[{"id":3643,"parentId":0,"regionName":"阿坝","cityCode":513200,"pinYin":"ABA"},{"id":3090,"parentId":0,"regionName":"阿克苏","cityCode":652900,"pinYin":"AKESU"},{"id":3632,"parentId":0,"regionName":"阿拉善","cityCode":152900,"pinYin":"ALASHAN"},{"id":899,"parentId":0,"regionName":"安康","cityCode":610900,"pinYin":"ANKANG"},{"id":196,"parentId":0,"regionName":"安庆","cityCode":340800,"pinYin":"ANQING"},{"id":758,"parentId":0,"regionName":"鞍山","cityCode":210300,"pinYin":"ANSHAN"},{"id":388,"parentId":0,"regionName":"安顺","cityCode":520400,"pinYin":"ANSHUN"},{"id":454,"parentId":0,"regionName":"安阳","cityCode":410500,"pinYin":"ANYANG"}],"B":[{"id":3633,"parentId":0,"regionName":"白城","cityCode":220800,"pinYin":"BAICHENG"},{"id":356,"parentId":0,"regionName":"百色","cityCode":451000,"pinYin":"BAISE"},{"id":634,"parentId":0,"regionName":"白山","cityCode":220600,"pinYin":"BAISHAN"},{"id":275,"parentId":0,"regionName":"白银","cityCode":620400,"pinYin":"BAIYIN"},{"id":426,"parentId":0,"regionName":"保定","cityCode":130600,"pinYin":"BAODING"},{"id":188,"parentId":0,"regionName":"宝鸡","cityCode":610300,"pinYin":"BAOJI"},{"id":994,"parentId":0,"regionName":"保山","cityCode":530500,"pinYin":"BAOSHAN"},{"id":1181,"parentId":0,"regionName":"包头","cityCode":150200,"pinYin":"BAOTOU"},{"id":789,"parentId":0,"regionName":"巴彦淖尔","cityCode":150800,"pinYin":"BAYANNAOER"},{"id":925,"parentId":0,"regionName":"巴中","cityCode":511900,"pinYin":"BAZHONG"},{"id":358,"parentId":0,"regionName":"北海","cityCode":450500,"pinYin":"BEIHAI"},{"id":3,"parentId":0,"regionName":"北京","cityCode":110100,"pinYin":"BEIJING"},{"id":200,"parentId":0,"regionName":"蚌埠","cityCode":340300,"pinYin":"BENGBU"},{"id":760,"parentId":0,"regionName":"本溪","cityCode":210500,"pinYin":"BENXI"},{"id":390,"parentId":0,"regionName":"毕节","cityCode":522401,"pinYin":"BIJIE"},{"id":824,"parentId":0,"regionName":"滨州","cityCode":371600,"pinYin":"BINZHOU"},{"id":1126,"parentId":0,"regionName":"亳州","cityCode":341600,"pinYin":"BO
ZHOU"},{"id":5860,"parentId":0,"regionName":"巴音郭楞","cityCode":652800,"pinYin":"BYGL"}],"C":[{"id":430,"parentId":0,"regionName":"沧州","cityCode":130900,"pinYin":"CANGZHOU"},{"id":623,"parentId":0,"regionName":"长春","cityCode":220100,"pinYin":"CHANGCHUN"},{"id":573,"parentId":0,"regionName":"常德","cityCode":430700,"pinYin":"CHANGDE"},{"id":983,"parentId":0,"regionName":"昌吉","cityCode":652300,"pinYin":"CHANGJI"},{"id":5781,"parentId":0,"regionName":"昌江","cityCode":469026,"pinYin":"CHANGJIANG"},{"id":576,"parentId":0,"regionName":"长沙","cityCode":430100,"pinYin":"CHANGSHA"},{"id":883,"parentId":0,"regionName":"长治","cityCode":140400,"pinYin":"CHANGZHI"},{"id":651,"parentId":0,"regionName":"常州","cityCode":320400,"pinYin":"CHANGZHOU"},{"id":3244,"parentId":0,"regionName":"朝阳","cityCode":211300,"pinYin":"CHAOYANG"},{"id":1138,"parentId":0,"regionName":"潮州","cityCode":445100,"pinYin":"CHAOZHOU"},{"id":433,"parentId":0,"regionName":"承德","cityCode":130800,"pinYin":"CHENGDE"},{"id":70,"parentId":0,"regionName":"成都","cityCode":510100,"pinYin":"CHENGDU"},{"id":5859,"parentId":0,"regionName":"澄迈县","cityCode":469023,"pinYin":"CHENGMAI"},{"id":585,"parentId":0,"regionName":"郴州","cityCode":431000,"pinYin":"CHENZHOU"},{"id":791,"parentId":0,"regionName":"赤峰","cityCode":150400,"pinYin":"CHIFENG"},{"id":205,"parentId":0,"regionName":"池州","cityCode":341700,"pinYin":"CHIZHOU"},{"id":40,"parentId":0,"regionName":"重庆","cityCode":500100,"pinYin":"CHONGQING"},{"id":3640,"parentId":0,"regionName":"崇左","cityCode":451400,"pinYin":"CHONGZUO"},{"id":996,"parentId":0,"regionName":"楚雄","cityCode":532300,"pinYin":"CHUXIONG"},{"id":207,"parentId":0,"regionName":"滁州","cityCode":341100,"pinYin":"CHUZHOU"}],"D":[{"id":998,"parentId":0,"regionName":"大理","cityCode":532900,"pinYin":"DALI"},{"id":763,"parentId":0,"regionName":"大连","cityCode":210200,"pinYin":"DALIAN"},{"id":3071,"parentId":0,"regionName":"儋州","cityCode":460400,"pinYin":"DAN"},{"id":753,"parentId":0,"regionName":"丹东","cityCode":210600,"pinYin":"D
ANDONG"},{"id":514,"parentId":0,"regionName":"大庆","cityCode":230600,"pinYin":"DAQING"},{"id":885,"parentId":0,"regionName":"大同","cityCode":140200,"pinYin":"DATONG"},{"id":3638,"parentId":0,"regionName":"大兴安岭","cityCode":232700,"pinYin":"DAXINGANLING"},{"id":935,"parentId":0,"regionName":"达州","cityCode":511700,"pinYin":"DAZHOU"},{"id":3650,"parentId":0,"regionName":"德宏","cityCode":533100,"pinYin":"DEHONG"},{"id":937,"parentId":0,"regionName":"德阳","cityCode":510600,"pinYin":"DEYANG"},{"id":827,"parentId":0,"regionName":"德州","cityCode":371400,"pinYin":"DEZHOU"},{"id":5884,"parentId":0,"regionName":"定安","cityCode":469021,"pinYin":"DINGANXIAN"},{"id":1135,"parentId":0,"regionName":"定西","cityCode":621100,"pinYin":"DINGXI"},{"id":1000,"parentId":0,"regionName":"迪庆","cityCode":533400,"pinYin":"DIQINGZANGZU"},{"id":5742,"parentId":0,"regionName":"东方","cityCode":469007,"pinYin":"DONGFANG"},{"id":109,"parentId":0,"regionName":"东莞","cityCode":441900,"pinYin":"DONGGUAN"},{"id":829,"parentId":0,"regionName":"东营","cityCode":370500,"pinYin":"DONGYING"}],"E":[{"id":793,"parentId":0,"regionName":"鄂尔多斯","cityCode":150600,"pinYin":"EERDUOSI"},{"id":541,"parentId":0,"regionName":"恩施","cityCode":422800,"pinYin":"ENSHI"},{"id":543,"parentId":0,"regionName":"鄂州","cityCode":420700,"pinYin":"EZHOU"}],"F":[{"id":360,"parentId":0,"regionName":"防城港","cityCode":450600,"pinYin":"FANGCHENGGANG"},{"id":61,"parentId":0,"regionName":"佛山","cityCode":440600,"pinYin":"FOSHAN"},{"id":770,"parentId":0,"regionName":"抚顺","cityCode":210400,"pinYin":"FUSHUN"},{"id":1176,"parentId":0,"regionName":"阜新","cityCode":210900,"pinYin":"FUXIN"},{"id":1125,"parentId":0,"regionName":"阜阳","cityCode":341200,"pinYin":"FUYANG"},{"id":745,"parentId":0,"regionName":"抚州","cityCode":361000,"pinYin":"FUZHOU"},{"id":98,"parentId":0,"regionName":"福州","cityCode":350100,"pinYin":"FUZHOU"}],"G":[{"id":3658,"parentId":0,"regionName":"甘南","cityCode":623000,"pinYin":"GANNAN"},{"id":718,"parentId":0,"regionName":"赣州","cityCode":360700,"p
inYin":"GANZHOU"},{"id":3644,"parentId":0,"regionName":"甘孜","cityCode":513300,"pinYin":"GANZI"},{"id":2166,"parentId":43,"regionName":"巩义市","cityCode":410181,"pinYin":"GONGYI","selected":1},{"id":3642,"parentId":0,"regionName":"广安","cityCode":511600,"pinYin":"GUANGAN"},{"id":3453,"parentId":0,"regionName":"广元","cityCode":510800,"pinYin":"GUANGYUAN"},{"id":8,"parentId":0,"regionName":"广州","cityCode":440100,"pinYin":"GUANGZHOU"},{"id":362,"parentId":0,"regionName":"贵港","cityCode":450800,"pinYin":"GUIGANG"},{"id":364,"parentId":0,"regionName":"桂林","cityCode":450300,"pinYin":"GUILIN"},{"id":394,"parentId":0,"regionName":"贵阳","cityCode":520100,"pinYin":"GUIYANG"},{"id":1183,"parentId":0,"regionName":"固原","cityCode":640400,"pinYin":"GUYUAN"}],"H":[{"id":508,"parentId":0,"regionName":"哈尔滨","cityCode":230100,"pinYin":"HAERBIN"},{"id":3659,"parentId":0,"regionName":"海东","cityCode":630200,"pinYin":"HAIDONG"},{"id":414,"parentId":0,"regionName":"海口","cityCode":460100,"pinYin":"HAIKOU"},{"id":5788,"parentId":0,"regionName":"海南州","cityCode":632500,"pinYin":"HAINANZHOU"},{"id":3665,"parentId":0,"regionName":"海西","cityCode":632800,"pinYin":"HAIXI"},{"id":3669,"parentId":0,"regionName":"哈密","cityCode":652200,"pinYin":"HAMI"},{"id":435,"parentId":0,"regionName":"邯郸","cityCode":130400,"pinYin":"HANDAN"},{"id":16,"parentId":0,"regionName":"杭州","cityCode":330100,"pinYin":"HANGZHOU","selected":0},{"id":902,"parentId":0,"regionName":"汉中","cityCode":610700,"pinYin":"HANZHONG"},{"id":460,"parentId":0,"regionName":"鹤壁","cityCode":410600,"pinYin":"HEBI"},{"id":1144,"parentId":0,"regionName":"河池","cityCode":451200,"pinYin":"HECHI"},{"id":210,"parentId":0,"regionName":"合肥","cityCode":340100,"pinYin":"HEFEI"},{"id":1154,"parentId":0,"regionName":"鹤岗","cityCode":230400,"pinYin":"HEGANG"},{"id":3637,"parentId":0,"regionName":"黑河","cityCode":231100,"pinYin":"HEIHE"},{"id":1148,"parentId":0,"regionName":"衡水","cityCode":131100,"pinYin":"HENGSHUI"},{"id":587,"parentId":0,"regionName":"衡阳","cityCode":
430400,"pinYin":"HENGYANG"},{"id":3673,"parentId":0,"regionName":"和田","cityCode":653200,"pinYin":"HETIAN"},{"id":319,"parentId":0,"regionName":"河源","cityCode":441600,"pinYin":"HEYUAN"},{"id":832,"parentId":0,"regionName":"菏泽","cityCode":371700,"pinYin":"HEZE"},{"id":370,"parentId":0,"regionName":"贺州","cityCode":451100,"pinYin":"HEZHOU"},{"id":1002,"parentId":0,"regionName":"红河","cityCode":532500,"pinYin":"HONGHE"},{"id":666,"parentId":0,"regionName":"淮安","cityCode":320800,"pinYin":"HUAIAN"},{"id":1127,"parentId":0,"regionName":"淮北","cityCode":340600,"pinYin":"HUAIBEI"},{"id":590,"parentId":0,"regionName":"怀化","cityCode":431200,"pinYin":"HUAIHUA"},{"id":215,"parentId":0,"regionName":"淮南","cityCode":340400,"pinYin":"HUAINAN"},{"id":547,"parentId":0,"regionName":"黄冈","cityCode":421100,"pinYin":"HUANGGANG"},{"id":3661,"parentId":0,"regionName":"黄南","cityCode":632300,"pinYin":"HUANGNAN"},{"id":217,"parentId":0,"regionName":"黄山","cityCode":341000,"pinYin":"HUANGSHAN"},{"id":550,"parentId":0,"regionName":"黄石","cityCode":420200,"pinYin":"HUANGSHI"},{"id":796,"parentId":0,"regionName":"呼和浩特","cityCode":150100,"pinYin":"HUHEHAOTE"},{"id":163,"parentId":0,"regionName":"惠州","cityCode":441300,"pinYin":"HUIZHOU"},{"id":776,"parentId":0,"regionName":"葫芦岛","cityCode":211400,"pinYin":"HULUDAO"},{"id":801,"parentId":0,"regionName":"呼伦贝尔","cityCode":150700,"pinYin":"HULUNBEIER"},{"id":173,"parentId":0,"regionName":"湖州","cityCode":330500,"pinYin":"HUZHOU"}],"J":[{"id":523,"parentId":0,"regionName":"佳木斯","cityCode":230800,"pinYin":"JIAMUSI"},{"id":747,"parentId":0,"regionName":"吉安","cityCode":360800,"pinYin":"JIAN"},{"id":317,"parentId":0,"regionName":"江门","cityCode":440700,"pinYin":"JIANGMEN"},{"id":462,"parentId":0,"regionName":"焦作","cityCode":410800,"pinYin":"JIAOZUO"},{"id":156,"parentId":0,"regionName":"嘉兴","cityCode":330400,"pinYin":"JIAXING"},{"id":1136,"parentId":0,"regionName":"嘉峪关","cityCode":620200,"pinYin":"JIAYUGUAN"},{"id":327,"parentId":0,"regionName":"揭阳","cityCode":4452
00,"pinYin":"JIEYANG"},{"id":628,"parentId":0,"regionName":"吉林","cityCode":220200,"pinYin":"JILIN"},{"id":837,"parentId":0,"regionName":"济南","cityCode":370100,"pinYin":"JINAN"},{"id":3556,"parentId":0,"regionName":"金昌","cityCode":620300,"pinYin":"JINCHANG"},{"id":892,"parentId":0,"regionName":"晋城","cityCode":140500,"pinYin":"JINCHENG"},{"id":724,"parentId":0,"regionName":"景德镇","cityCode":360200,"pinYin":"JINGDEZHEN"},{"id":536,"parentId":0,"regionName":"荆门","cityCode":420800,"pinYin":"JINGMEN"},{"id":545,"parentId":0,"regionName":"荆州","cityCode":421000,"pinYin":"JINGZHOU"},{"id":142,"parentId":0,"regionName":"金华","cityCode":330700,"pinYin":"JINHUA"},{"id":842,"parentId":0,"regionName":"济宁","cityCode":370800,"pinYin":"JINING"},{"id":894,"parentId":0,"regionName":"晋中","cityCode":140700,"pinYin":"JINZHONG"},{"id":779,"parentId":0,"regionName":"锦州","cityCode":210700,"pinYin":"JINZHOU"},{"id":726,"parentId":0,"regionName":"九江","cityCode":360400,"pinYin":"JIUJIANG"},{"id":277,"parentId":0,"regionName":"酒泉","cityCode":620900,"pinYin":"JIUQUAN"},{"id":521,"parentId":0,"regionName":"鸡西","cityCode":230300,"pinYin":"JIXI"},{"id":1102,"parentId":0,"regionName":"济源","cityCode":410881,"pinYin":"JIYUAN"}],"K":[{"id":466,"parentId":0,"regionName":"开封","cityCode":410200,"pinYin":"KAIFENG"},{"id":985,"parentId":0,"regionName":"喀什","cityCode":653100,"pinYin":"KASHEN"},{"id":3667,"parentId":0,"regionName":"克拉玛依","cityCode":650200,"pinYin":"KELAMAYI"},{"id":3672,"parentId":0,"regionName":"克孜勒苏柯尔克孜","cityCode":653000,"pinYin":"KEZILESUKEERKEZI"},{"id":18,"parentId":0,"regionName":"昆明","cityCode":530100,"pinYin":"KUNMING"}],"L":[{"id":3639,"parentId":0,"regionName":"来宾","cityCode":451300,"pinYin":"LAIBIN"},{"id":419,"parentId":0,"regionName":"廊坊","cityCode":131000,"pinYin":"LANGFANG"},{"id":279,"parentId":0,"regionName":"兰州","cityCode":620100,"pinYin":"LANZHOU"},{"id":979,"parentId":0,"regionName":"拉萨","cityCode":540100,"pinYin":"LASA"},{"id":940,"parentId":0,"regionName":"乐山","cityCode":
511100,"pinYin":"LESHAN"},{"id":3645,"parentId":0,"regionName":"凉山","cityCode":513400,"pinYin":"LIANGSHAN"},{"id":677,"parentId":0,"regionName":"连云港","cityCode":320700,"pinYin":"LIANYUNGANG"},{"id":847,"parentId":0,"regionName":"聊城","cityCode":371500,"pinYin":"LIAOCHENG"},{"id":1178,"parentId":0,"regionName":"辽阳","cityCode":211000,"pinYin":"LIAOYANG"},{"id":630,"parentId":0,"regionName":"辽源","cityCode":220400,"pinYin":"LIAOYUAN"},{"id":992,"parentId":0,"regionName":"丽江","cityCode":530700,"pinYin":"LIJIANG"},{"id":1008,"parentId":0,"regionName":"临沧","cityCode":530900,"pinYin":"LINCANG"},{"id":890,"parentId":0,"regionName":"临汾","cityCode":141000,"pinYin":"LINFEN"},{"id":5590,"parentId":0,"regionName":"临高","cityCode":469024,"pinYin":"LINGAO"},{"id":3498,"parentId":0,"regionName":"临夏","cityCode":622900,"pinYin":"LINXIA"},{"id":849,"parentId":0,"regionName":"临沂","cityCode":371300,"pinYin":"LINYI"},{"id":3657,"parentId":0,"regionName":"林芝","cityCode":542600,"pinYin":"LINZHI"},{"id":1039,"parentId":0,"regionName":"丽水","cityCode":331100,"pinYin":"LISHUI"},{"id":227,"parentId":0,"regionName":"六安","cityCode":341500,"pinYin":"LIUAN"},{"id":406,"parentId":0,"regionName":"六盘水","cityCode":520200,"pinYin":"LIUPANSHUI"},{"id":380,"parentId":0,"regionName":"柳州","cityCode":450200,"pinYin":"LIUZHOU"},{"id":288,"parentId":0,"regionName":"陇南","cityCode":621200,"pinYin":"LONGNAN"},{"id":263,"parentId":0,"regionName":"龙岩","cityCode":350800,"pinYin":"LONGYAN"},{"id":595,"parentId":0,"regionName":"娄底","cityCode":431300,"pinYin":"LOUDI"},{"id":5863,"parentId":0,"regionName":"陵水","cityCode":469028,"pinYin":"LS"},{"id":1194,"parentId":0,"regionName":"吕梁","cityCode":141100,"pinYin":"LULIANG"},{"id":495,"parentId":0,"regionName":"漯河","cityCode":411100,"pinYin":"LUOHE"},{"id":486,"parentId":0,"regionName":"洛阳","cityCode":410300,"pinYin":"LUOYANG"},{"id":959,"parentId":0,"regionName":"泸州","cityCode":510500,"pinYin":"LUZHOU"}],"M":[{"id":170,"parentId":0,"regionName":"马鞍山","cityCode":340500,"pinYin
":"MAANSHAN"},{"id":348,"parentId":0,"regionName":"茂名","cityCode":440900,"pinYin":"MAOMING"},{"id":961,"parentId":0,"regionName":"眉山","cityCode":511400,"pinYin":"MEISHAN"},{"id":350,"parentId":0,"regionName":"梅州","cityCode":441400,"pinYin":"MEIZHOU"},{"id":944,"parentId":0,"regionName":"绵阳","cityCode":510700,"pinYin":"MIANYANG"},{"id":528,"parentId":0,"regionName":"牡丹江","cityCode":231000,"pinYin":"MUDANJIANG"}],"N":[{"id":738,"parentId":0,"regionName":"南昌","cityCode":360100,"pinYin":"NANCHANG"},{"id":968,"parentId":0,"regionName":"南充","cityCode":511300,"pinYin":"NANCHONG"},{"id":63,"parentId":0,"regionName":"南京","cityCode":320100,"pinYin":"NANJING"},{"id":372,"parentId":0,"regionName":"南宁","cityCode":450100,"pinYin":"NANNING"},{"id":254,"parentId":0,"regionName":"南平","cityCode":350700,"pinYin":"NANPING"},{"id":132,"parentId":0,"regionName":"南通","cityCode":320600,"pinYin":"NANTONG"},{"id":499,"parentId":0,"regionName":"南阳","cityCode":411300,"pinYin":"NANYANG"},{"id":970,"parentId":0,"regionName":"内江","cityCode":511000,"pinYin":"NEIJIANG"},{"id":147,"parentId":0,"regionName":"宁波","cityCode":330200,"pinYin":"NINGBO"},{"id":268,"parentId":0,"regionName":"宁德","cityCode":350900,"pinYin":"NINGDE"},{"id":3651,"parentId":0,"regionName":"怒江","cityCode":533300,"pinYin":"NUJIANG"}],"P":[{"id":784,"parentId":0,"regionName":"盘锦","cityCode":211100,"pinYin":"PANJIN"},{"id":951,"parentId":0,"regionName":"攀枝花","cityCode":510400,"pinYin":"PANZHIHUA"},{"id":502,"parentId":0,"regionName":"平顶山","cityCode":410400,"pinYin":"PINGDINGSHAN"},{"id":1137,"parentId":0,"regionName":"平凉","cityCode":620800,"pinYin":"PINGLIANG"},{"id":711,"parentId":0,"regionName":"萍乡","cityCode":360300,"pinYin":"PINGXIANG"},{"id":3198,"parentId":0,"regionName":"普洱","cityCode":530800,"pinYin":"PUER"},{"id":271,"parentId":0,"regionName":"莆田","cityCode":350300,"pinYin":"PUTIAN"},{"id":458,"parentId":0,"regionName":"濮阳","cityCode":410900,"pinYin":"PUYANG"}],"Q":[{"id":3647,"parentId":0,"regionName":"黔东南","cityCode":522
600,"pinYin":"QIANDONGNAN"},{"id":1158,"parentId":0,"regionName":"潜江","cityCode":429005,"pinYin":"QIANJIANG"},{"id":3648,"parentId":0,"regionName":"黔南","cityCode":522700,"pinYin":"QIANNAN"},{"id":3646,"parentId":0,"regionName":"黔西南","cityCode":522300,"pinYin":"QIANXINAN"},{"id":51,"parentId":0,"regionName":"青岛","cityCode":370200,"pinYin":"QINGDAO"},{"id":3318,"parentId":0,"regionName":"庆阳","cityCode":621000,"pinYin":"QINGYANG"},{"id":102,"parentId":0,"regionName":"清远","cityCode":441800,"pinYin":"QINGYUAN"},{"id":446,"parentId":0,"regionName":"秦皇岛","cityCode":130300,"pinYin":"QINHUANGDAO"},{"id":1145,"parentId":0,"regionName":"钦州","cityCode":450700,"pinYin":"QINZHOU"},{"id":1124,"parentId":0,"regionName":"琼海","cityCode":469002,"pinYin":"QIONGHAI"},{"id":5851,"parentId":0,"regionName":"琼中","cityCode":469030,"pinYin":"QIONGZHONG"},{"id":530,"parentId":0,"regionName":"齐齐哈尔","cityCode":230200,"pinYin":"QIQIHAER"},{"id":3636,"parentId":0,"regionName":"七台河","cityCode":230900,"pinYin":"QITAIHE"},{"id":245,"parentId":0,"regionName":"泉州","cityCode":350500,"pinYin":"QUANZHOU"},{"id":1016,"parentId":0,"regionName":"曲靖","cityCode":530300,"pinYin":"QUJING"},{"id":145,"parentId":0,"regionName":"衢州","cityCode":330800,"pinYin":"QUZHOU"}],"R":[{"id":3654,"parentId":0,"regionName":"日喀则","cityCode":540200,"pinYin":"RIKEZE"},{"id":877,"parentId":0,"regionName":"日照","cityCode":371100,"pinYin":"RIZHAO"}],"S":[{"id":449,"parentId":0,"regionName":"三门峡","cityCode":411200,"pinYin":"SANMENXIA"},{"id":239,"parentId":0,"regionName":"三明","cityCode":350400,"pinYin":"SANMING"},{"id":410,"parentId":0,"regionName":"三亚","cityCode":460200,"pinYin":"SANYA"},{"id":1,"parentId":0,"regionName":"上海","cityCode":310100,"pinYin":"SHANGHAI"},{"id":897,"parentId":0,"regionName":"商洛","cityCode":611000,"pinYin":"SHANGLUO"},{"id":452,"parentId":0,"regionName":"商丘","cityCode":411400,"pinYin":"SHANGQIU"},{"id":713,"parentId":0,"regionName":"上饶","cityCode":361100,"pinYin":"SHANGRAO"},{"id":3653,"parentId":0,"regionNam
e":"山南","cityCode":540500,"pinYin":"SHANNANSHI"},{"id":290,"parentId":0,"regionName":"汕头","cityCode":440500,"pinYin":"SHANTOU"},{"id":294,"parentId":0,"regionName":"汕尾","cityCode":441500,"pinYin":"SHANWEI"},{"id":296,"parentId":0,"regionName":"韶关","cityCode":440200,"pinYin":"SHAOGUAN"},{"id":66,"parentId":0,"regionName":"绍兴","cityCode":330600,"pinYin":"SHAOXING"},{"id":571,"parentId":0,"regionName":"邵阳","cityCode":430500,"pinYin":"SHAOYANG"},{"id":75,"parentId":0,"regionName":"沈阳","cityCode":210100,"pinYin":"SHENYANG"},{"id":28,"parentId":0,"regionName":"深圳","cityCode":440300,"pinYin":"SHENZHEN"},{"id":1200,"parentId":0,"regionName":"石河子","cityCode":659001,"pinYin":"SHIHEZI"},{"id":59,"parentId":0,"regionName":"石家庄","cityCode":130100,"pinYin":"SHIJIAZHUANG"},{"id":68,"parentId":0,"regionName":"十堰","cityCode":420300,"pinYin":"SHIYAN"},{"id":807,"parentId":0,"regionName":"石嘴山","cityCode":640200,"pinYin":"SHIZUISHAN"},{"id":3635,"parentId":0,"regionName":"双鸭山","cityCode":230500,"pinYin":"SHUANGYASHAN"},{"id":3629,"parentId":0,"regionName":"朔州","cityCode":140600,"pinYin":"SHUOZHOU"},{"id":621,"parentId":0,"regionName":"四平","cityCode":220300,"pinYin":"SIPING"},{"id":1174,"parentId":0,"regionName":"松原","cityCode":220700,"pinYin":"SONGYUAN"},{"id":511,"parentId":0,"regionName":"绥化","cityCode":231200,"pinYin":"SUIHUA"},{"id":922,"parentId":0,"regionName":"遂宁","cityCode":510900,"pinYin":"SUINING"},{"id":534,"parentId":0,"regionName":"随州","cityCode":421300,"pinYin":"SUIZHOU"},{"id":644,"parentId":0,"regionName":"宿迁","cityCode":321300,"pinYin":"SUQIAN"},{"id":193,"parentId":0,"regionName":"宿州","cityCode":341300,"pinYin":"SUZHOU"},{"id":107,"parentId":0,"regionName":"苏州","cityCode":320500,"pinYin":"SUZHOU"}],"T":[{"id":3674,"parentId":0,"regionName":"塔城","cityCode":654200,"pinYin":"TACHENG"},{"id":817,"parentId":0,"regionName":"泰安","cityCode":370900,"pinYin":"TAIAN"},{"id":81,"parentId":0,"regionName":"太原","cityCode":140100,"pinYin":"TAIYUAN"},{"id":181,"parentId":0,"regionName
":"台州","cityCode":331000,"pinYin":"TAIZHOU"},{"id":640,"parentId":0,"regionName":"泰州","cityCode":321200,"pinYin":"TAIZHOU"},{"id":83,"parentId":0,"regionName":"唐山","cityCode":130200,"pinYin":"TANGSHAN"},{"id":22,"parentId":0,"regionName":"天津","cityCode":120100,"pinYin":"TIANJIN"},{"id":1159,"parentId":0,"regionName":"天门","cityCode":429006,"pinYin":"TIANMEN"},{"id":1119,"parentId":0,"regionName":"天水","cityCode":620500,"pinYin":"TIANSHUI"},{"id":1179,"parentId":0,"regionName":"铁岭","cityCode":211200,"pinYin":"TIELING"},{"id":1187,"parentId":0,"regionName":"铜川","cityCode":610200,"pinYin":"TONGCHUAN"},{"id":619,"parentId":0,"regionName":"通化","cityCode":220500,"pinYin":"TONGHUA"},{"id":787,"parentId":0,"regionName":"通辽","cityCode":150500,"pinYin":"TONGLIAO"},{"id":191,"parentId":0,"regionName":"铜陵","cityCode":340700,"pinYin":"TONGLING"},{"id":386,"parentId":0,"regionName":"铜仁","cityCode":522201,"pinYin":"TONGREN"}],"W":[{"id":5534,"parentId":0,"regionName":"万宁","cityCode":469006,"pinYin":"WANNING"},{"id":821,"parentId":0,"regionName":"潍坊","cityCode":370700,"pinYin":"WEIFANG"},{"id":853,"parentId":0,"regionName":"威海","cityCode":371000,"pinYin":"WEIHAI"},{"id":905,"parentId":0,"regionName":"渭南","cityCode":610500,"pinYin":"WEINAN"},{"id":5773,"parentId":0,"regionName":"文昌","cityCode":469005,"pinYin":"WENCHANG"},{"id":3269,"parentId":0,"regionName":"文山","cityCode":532600,"pinYin":"WENSHAN"},{"id":1047,"parentId":0,"regionName":"温州","cityCode":330300,"pinYin":"WENZHOU"},{"id":803,"parentId":0,"regionName":"乌海","cityCode":150300,"pinYin":"WUHAI"},{"id":10,"parentId":0,"regionName":"武汉","cityCode":420100,"pinYin":"WUHAN"},{"id":219,"parentId":0,"regionName":"芜湖","cityCode":340200,"pinYin":"WUHU"},{"id":5754,"parentId":0,"regionName":"五家渠","cityCode":659004,"pinYin":"WUJIAQU"},{"id":3630,"parentId":0,"regionName":"乌兰察布","cityCode":150900,"pinYin":"WULANCHABU"},{"id":987,"parentId":0,"regionName":"乌鲁木齐","cityCode":650100,"pinYin":"WULUMUQI"},{"id":284,"parentId":0,"regionName":"武威
","cityCode":620600,"pinYin":"WUWEI"},{"id":151,"parentId":0,"regionName":"无锡","cityCode":320200,"pinYin":"WUXI"},{"id":3666,"parentId":0,"regionName":"吴忠","cityCode":640300,"pinYin":"WUZHONG"},{"id":374,"parentId":0,"regionName":"梧州","cityCode":450400,"pinYin":"WUZHOU"}],"X":[{"id":89,"parentId":0,"regionName":"厦门","cityCode":350200,"pinYin":"XIAMEN"},{"id":46,"parentId":0,"regionName":"西安","cityCode":610100,"pinYin":"XIAN"},{"id":599,"parentId":0,"regionName":"湘潭","cityCode":430300,"pinYin":"XIANGTAN"},{"id":602,"parentId":0,"regionName":"湘西","cityCode":433100,"pinYin":"XIANGXI"},{"id":731,"parentId":0,"regionName":"襄阳","cityCode":420600,"pinYin":"XIANGYANG"},{"id":538,"parentId":0,"regionName":"咸宁","cityCode":421200,"pinYin":"XIANNING"},{"id":569,"parentId":0,"regionName":"仙桃","cityCode":429004,"pinYin":"XIANTAO"},{"id":918,"parentId":0,"regionName":"咸阳","cityCode":610400,"pinYin":"XIANYANG"},{"id":1160,"parentId":0,"regionName":"孝感","cityCode":420900,"pinYin":"XIAOGAN"},{"id":3303,"parentId":0,"regionName":"锡林郭勒","cityCode":152500,"pinYin":"XILINGUOLE"},{"id":3631,"parentId":0,"regionName":"兴安盟","cityCode":152200,"pinYin":"XINGAN"},{"id":441,"parentId":0,"regionName":"邢台","cityCode":130500,"pinYin":"XINGTAI"},{"id":3679,"parentId":3646,"regionName":"兴义市","cityCode":522301,"pinYin":"XINGYI","selected":1},{"id":814,"parentId":0,"regionName":"西宁","cityCode":630100,"pinYin":"XINING"},{"id":472,"parentId":0,"regionName":"新乡","cityCode":410700,"pinYin":"XINXIANG"},{"id":470,"parentId":0,"regionName":"信阳","cityCode":411500,"pinYin":"XINYANG"},{"id":733,"parentId":0,"regionName":"新余","cityCode":360500,"pinYin":"XINYU"},{"id":3432,"parentId":0,"regionName":"忻州","cityCode":140900,"pinYin":"XINZHOU"},{"id":1010,"parentId":0,"regionName":"西双版纳","cityCode":532800,"pinYin":"XISHUANGBANNA"},{"id":224,"parentId":0,"regionName":"宣城","cityCode":341800,"pinYin":"XUANCHENG"},{"id":477,"parentId":0,"regionName":"许昌","cityCode":411000,"pinYin":"XUCHANG"},{"id":95,"parentId":0,"region
Name":"徐州","cityCode":320300,"pinYin":"XUZHOU"}],"Y":[{"id":3438,"parentId":0,"regionName":"雅安","cityCode":511800,"pinYin":"YAAN"},{"id":912,"parentId":0,"regionName":"延安","cityCode":610600,"pinYin":"YANAN"},{"id":3634,"parentId":0,"regionName":"延边","cityCode":222400,"pinYin":"YANBIAN"},{"id":642,"parentId":0,"regionName":"盐城","cityCode":320900,"pinYin":"YANCHENG"},{"id":329,"parentId":0,"regionName":"阳江","cityCode":441700,"pinYin":"YANGJIANG"},{"id":5750,"parentId":0,"regionName":"洋浦","cityCode":469000,"pinYin":"YANGPU"},{"id":1195,"parentId":0,"regionName":"阳泉","cityCode":140300,"pinYin":"YANGQUAN"},{"id":660,"parentId":0,"regionName":"扬州","cityCode":321000,"pinYin":"YANGZHOU"},{"id":105,"parentId":0,"regionName":"烟台","cityCode":370600,"pinYin":"YANTAI"},{"id":949,"parentId":0,"regionName":"宜宾","cityCode":511500,"pinYin":"YIBIN"},{"id":565,"parentId":0,"regionName":"宜昌","cityCode":420500,"pinYin":"YICHANG"},{"id":3463,"parentId":0,"regionName":"伊春","cityCode":230700,"pinYin":"YICHUN"},{"id":716,"parentId":0,"regionName":"宜春","cityCode":360900,"pinYin":"YICHUN"},{"id":1104,"parentId":0,"regionName":"伊犁","cityCode":654000,"pinYin":"YILI"},{"id":810,"parentId":0,"regionName":"银川","cityCode":640100,"pinYin":"YINCHUAN"},{"id":774,"parentId":0,"regionName":"营口","cityCode":210800,"pinYin":"YINGKOU"},{"id":1170,"parentId":0,"regionName":"鹰潭","cityCode":360600,"pinYin":"YINGTAN"},{"id":4636,"parentId":151,"regionName":"宜兴市","cityCode":320282,"pinYin":"YIXINGSHI","selected":1},{"id":605,"parentId":0,"regionName":"益阳","cityCode":430900,"pinYin":"YIYANG"},{"id":1164,"parentId":0,"regionName":"永州","cityCode":431100,"pinYin":"YONGZHOU"},{"id":607,"parentId":0,"regionName":"岳阳","cityCode":430600,"pinYin":"YUEYANG"},{"id":378,"parentId":0,"regionName":"玉林","cityCode":450900,"pinYin":"YULIN"},{"id":914,"parentId":0,"regionName":"榆林","cityCode":610800,"pinYin":"YULIN"},{"id":888,"parentId":0,"regionName":"运城","cityCode":140800,"pinYin":"YUNCHENG"},{"id":332,"parentId":0,"regionName
":"云浮","cityCode":445300,"pinYin":"YUNFU"},{"id":3664,"parentId":0,"regionName":"玉树","cityCode":632700,"pinYin":"YUSHU"},{"id":1012,"parentId":0,"regionName":"玉溪","cityCode":530400,"pinYin":"YUXI"}],"Z":[{"id":857,"parentId":0,"regionName":"枣庄","cityCode":370400,"pinYin":"ZAOZHUANG"},{"id":1236,"parentId":0,"regionName":"张家界","cityCode":430800,"pinYin":"ZHANGGUJIE"},{"id":443,"parentId":0,"regionName":"张家口","cityCode":130700,"pinYin":"ZHANGJIAKOU"},{"id":286,"parentId":0,"regionName":"张掖","cityCode":620700,"pinYin":"ZHANGYE"},{"id":243,"parentId":0,"regionName":"漳州","cityCode":350600,"pinYin":"ZHANGZHOU"},{"id":334,"parentId":0,"regionName":"湛江","cityCode":440800,"pinYin":"ZHANJIANG"},{"id":337,"parentId":0,"regionName":"肇庆","cityCode":441200,"pinYin":"ZHAOQING"},{"id":3649,"parentId":0,"regionName":"昭通","cityCode":530600,"pinYin":"ZHAOTONG"},{"id":43,"parentId":0,"regionName":"郑州","cityCode":410100,"pinYin":"ZHENGZHOU"},{"id":657,"parentId":0,"regionName":"镇江","cityCode":321100,"pinYin":"ZHENJIANG"},{"id":339,"parentId":0,"regionName":"中山","cityCode":442000,"pinYin":"ZHONGSHAN"},{"id":1184,"parentId":0,"regionName":"中卫","cityCode":640500,"pinYin":"ZHONGWEI"},{"id":93,"parentId":0,"regionName":"周口","cityCode":411600,"pinYin":"ZHOUKOU"},{"id":1055,"parentId":0,"regionName":"舟山","cityCode":330900,"pinYin":"ZHOUSHAN"},{"id":346,"parentId":0,"regionName":"珠海","cityCode":440400,"pinYin":"ZHUHAI"},{"id":484,"parentId":0,"regionName":"驻马店","cityCode":411700,"pinYin":"ZHUMADIAN"},{"id":597,"parentId":0,"regionName":"株洲","cityCode":430200,"pinYin":"ZHUZHOU"},{"id":860,"parentId":0,"regionName":"淄博","cityCode":370300,"pinYin":"ZIBO"},{"id":955,"parentId":0,"regionName":"自贡","cityCode":510300,"pinYin":"ZIGONG"},{"id":957,"parentId":0,"regionName":"资阳","cityCode":512000,"pinYin":"ZIYANG"},{"id":403,"parentId":0,"regionName":"遵义","cityCode":520300,"pinYin":"ZUNYI"}]}}

24、爬虫_解析_jsonpath解析淘票票



import urllib.request


# City-list endpoint of the Taobao movie site. The `jsoncallback=jsonp138` query
# parameter suggests the reply is wrapped in a `jsonp138(...)` call, which the code
# further down strips off before saving.
url = 'https://dianying.taobao.com/cityAction.json?activityId&_ksTS=1629789477003_137&jsoncallback=jsonp138&action=cityAction&n_s=new&event_submit_doGetAllRegion=true'

# Request headers copied from a browser session.
headers = {
    # Names that start with ':' are HTTP/2 pseudo-headers, not ordinary request
    # headers, so they are kept disabled.
    # ':authority': 'dianying.taobao.com',
    # ':method': 'GET',
    # ':path': '/cityAction.json?activityId&_ksTS=1629789477003_137&jsoncallback=jsonp138&action=cityAction&n_s=new&event_submit_doGetAllRegion=true',
    # ':scheme': 'https',
    'accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
    # NOTE(review): presumably disabled so the server returns an uncompressed body,
    # since the script decodes the raw bytes directly as UTF-8 - confirm.
    # 'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    # NOTE(review): cookie captured from a real browser session; it has probably
    # expired and should be replaced with a fresh one before running - confirm.
    'cookie': 'cna=UkO6F8VULRwCAXTqq7dbS5A8; miid=949542021157939863; sgcookie=E100F01JK9XMmyoZRigjfmZKExNdRHQqPf4v9NIWIC1nnpnxyNgROLshAf0gz7lGnkKvwCnu1umyfirMSAWtubqc4g%3D%3D; tracknick=action_li; _cc_=UIHiLt3xSw%3D%3D; enc=dA18hg7jG1xapfVGPHoQCAkPQ4as1%2FEUqsG4M6AcAjHFFUM54HWpBv4AAm0MbQgqO%2BiZ5qkUeLIxljrHkOW%2BtQ%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; _m_h5_tk=3ca69de1b9ad7dce614840fcd015dcdb_1629776735568; _m_h5_tk_enc=ab56df54999d1d2cac2f82753ae29f82; t=874e6ce33295bf6b95cfcfaff0af0db6; xlly_s=1; cookie2=13acd8f4dafac4f7bd2177d6710d60fe; v=0; _tb_token_=e65ebbe536158; tfstk=cGhRB7mNpnxkDmUx7YpDAMNM2gTGZbWLxUZN9U4ulewe025didli6j5AFPI8MEC..; l=eBrgmF1cOsMXqSxaBO5aFurza77tzIRb8sPzaNbMiInca6OdtFt_rNCK2Ns9SdtjgtfFBetPVKlOcRCEF3apbgiMW_N-1NKDSxJ6-; isg=BBoas2yXLzHdGp3pCh7XVmpja8A8S54lyLj1RySTHq14l7vRDNufNAjpZ2MLRxa9',
    'referer': 'https://dianying.taobao.com/',
    'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
    'sec-ch-ua-mobile': '?0',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
}

# Build a customised request so the browser-like headers are sent along.
request = urllib.request.Request(url=url, headers=headers)

# Use the response as a context manager so the connection is closed as soon as
# the body has been read.
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')

# The body is JSONP, i.e. `callback({...})`. Keep only what lies between the
# FIRST '(' and the LAST ')' so that parentheses occurring inside the JSON
# payload cannot truncate it.
content = content[content.index('(') + 1:content.rindex(')')]

# Save the bare JSON text so it can be parsed from disk afterwards.
with open('爬虫_解析_jsonpath解析淘票票.json', 'w', encoding='utf-8') as fp:
    fp.write(content)

import json
import jsonpath

# Load the saved JSON; the `with` block closes the file handle, which the
# previous `json.load(open(...))` form left open.
with open('爬虫_解析_jsonpath解析淘票票.json', 'r', encoding='utf-8') as fp:
    obj = json.load(fp)

# '$..regionName' collects every `regionName` value at any depth of the document.
city_list = jsonpath.jsonpath(obj, '$..regionName')

print(city_list)


25、爬虫_解析_bs4的基本使用

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>

    <div>
        <ul>
            <li id="l1">张三</li>
            <li id="l2">李四</li>
            <li>王五</li>
            <a href="" id="" class="a1">123</a>
            <span>嘿嘿嘿</span>
        </ul>
    </div>


    <a href="" title="a2">百度</a>

    <div id="d1">
        <span>
            哈哈哈
        </span>
    </div>

    <p id="p1" class="p1">呵呵呵</p>
</body>
</html>

26、爬虫_解析_bs4的基本使用



from bs4 import BeautifulSoup


# Parse a local HTML file to walk through the basic bs4 syntax.
# open() falls back to the platform default encoding (the original note mentions
# gbk), so the encoding is passed explicitly.
# The `with` block closes the file once BeautifulSoup has consumed it.
with open('爬虫_解析_bs4的基本使用.html', encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, 'lxml')

# Look up a node by tag name
# This returns the first matching element only
# print(soup.a)
# Get the attributes and attribute values of the tag
# print(soup.a.attrs)

# Some bs4 functions
# (1) find
# Returns the first matching element
# print(soup.find('a'))

# Find the tag whose title attribute has the given value
# print(soup.find('a',title="a2"))

# Find the tag by class value; note the trailing underscore in `class_`
# (`class` is a Python keyword)
# print(soup.find('a',class_="a1"))


# (2) find_all  returns a list containing every <a> tag
# print(soup.find_all('a'))

# To fetch several kinds of tags at once, pass a list of tag names to find_all
# print(soup.find_all(['a','span']))

# limit restricts the result to the first N matches
# print(soup.find_all('li',limit=2))


# (3) select (recommended)
# select returns a list and may return several elements
# print(soup.select('a'))

# A leading '.' stands for class; this is called a class selector
# print(soup.select('.a1'))

# A leading '#' stands for id (id selector)
# print(soup.select('#l1'))


# Attribute selectors --- find tags through their attributes
# <li> tags that have an id attribute
# print(soup.select('li[id]'))

# <li> tags whose id equals l2
# print(soup.select('li[id="l2"]'))


# Hierarchy selectors
#  Descendant selector
# Finds every <li> anywhere below a <div>
# print(soup.select('div li'))

# Child selector
#  Only the first-level children of a tag
# Original author's note: in many other tools the selector yields nothing when the
# spaces around '>' are omitted, whereas bs4 raises no error and still returns results
# print(soup.select('div > ul > li'))


# Find all <a> tags and all <li> tags
# print(soup.select('a,li'))

# Node information
#    Getting the node content
# obj = soup.select('#d1')[0]
# If the tag contains only text, both string and get_text() work
# If the tag contains other tags besides text, string yields no data while get_text() still does
# get_text() is the generally recommended choice
# print(obj.string)
# print(obj.get_text())

# Node attributes
# obj = soup.select('#p1')[0]
# name is the tag's name
# print(obj.name)
# attrs returns the attributes as a dict
# print(obj.attrs)

# Reading one attribute of a node: three equivalent spellings.
matches = soup.select('#p1')
obj = matches[0]

print(obj.attrs.get('class'))  # through the attrs dict
print(obj.get('class'))        # through Tag.get()
print(obj['class'])            # through subscripting the tag

27、爬虫_解析_bs4爬取星巴克数据



import urllib.request

from bs4 import BeautifulSoup

url = 'https://www.starbucks.com.cn/menu/'

# Close the HTTP response as soon as the page source has been read.
with urllib.request.urlopen(url) as response:
    content = response.read().decode('utf-8')

soup = BeautifulSoup(content, 'lxml')

# Equivalent xpath: //ul[@class="grid padded-3 product"]//strong/text()
# i.e. every <strong> below a <ul> whose class attribute is exactly that string.
name_list = soup.select('ul[class="grid padded-3 product"] strong')

# Print the text of each matched <strong> element.
for name in name_list:
    print(name.get_text())

28、爬虫_selenium_为什么要学习selenium


import urllib.request

url = 'https://www.jd.com/'

# Fetch the JD home page with plain urllib and close the response once read.
# NOTE(review): judging by the section title, this output is meant to be compared
# with what a real browser (selenium) receives - confirm.
with urllib.request.urlopen(url) as response:
    content = response.read().decode('utf-8')

print(content)

29、爬虫_selenium_基本使用


# (1) Import selenium
from selenium import webdriver

# (2) Create the browser driver object

path = 'chromedriver.exe'

browser = webdriver.Chrome(path)

try:
    # (3) Visit a site
    # url = 'https://www.baidu.com'
    #
    # browser.get(url)

    url = 'https://www.jd.com/'

    browser.get(url)

    # page_source returns the source of the page as loaded by the browser
    content = browser.page_source
    print(content)
finally:
    # Always shut the driver down, even if a step above raised, so no
    # chromedriver/Chrome process is left behind.
    browser.quit()


30、爬虫_selenium_元素定位



from selenium import webdriver

path = 'chromedriver.exe'
browser = webdriver.Chrome(path)

try:
    url = 'https://www.baidu.com'
    browser.get(url)

    # Locating elements (uncomment one example at a time)

    # Find an element by its id
    # button = browser.find_element_by_id('su')
    # print(button)

    # Find an element by the value of its name attribute
    # button = browser.find_element_by_name('wd')
    # print(button)

    # Find elements with an xpath expression (plural form returns a list)
    # button = browser.find_elements_by_xpath('//input[@id="su"]')
    # print(button)

    # Find elements by tag name
    # button = browser.find_elements_by_tag_name('input')
    # print(button)

    # Find elements with a CSS selector (the same syntax bs4's select uses)
    # button = browser.find_elements_by_css_selector('#su')
    # print(button)

    # Find a link by its visible text
    # button = browser.find_element_by_link_text('直播')
    # print(button)
finally:
    # Release the chromedriver/Chrome processes whatever happened above.
    browser.quit()

31、爬虫_selenium_元素信息



from selenium import webdriver

path = 'chromedriver.exe'
browser = webdriver.Chrome(path)

try:
    url = 'http://www.baidu.com'
    browser.get(url)

    # Named `search_button` rather than `input` so the builtin input() is not shadowed.
    search_button = browser.find_element_by_id('su')

    # Read an attribute of the element
    print(search_button.get_attribute('class'))
    # Read the tag name of the element
    print(search_button.tag_name)

    # Read the visible text of an element
    news_link = browser.find_element_by_link_text('新闻')
    print(news_link.text)
finally:
    # Release the chromedriver/Chrome processes whatever happened above.
    browser.quit()


32、爬虫_selenium_交互



import time

from selenium import webdriver

# Create the browser object
path = 'chromedriver.exe'
browser = webdriver.Chrome(path)

try:
    # Open the page
    url = 'https://www.baidu.com'
    browser.get(url)

    time.sleep(2)

    # Get the search text box (not named `input`, to avoid shadowing the builtin)
    search_box = browser.find_element_by_id('kw')

    # Type the query into the text box
    search_box.send_keys('周杰伦')

    time.sleep(2)

    # Get the search button
    button = browser.find_element_by_id('su')

    # Click the button
    button.click()

    time.sleep(2)

    # Scroll to the bottom of the page
    js_bottom = 'document.documentElement.scrollTop=100000'
    browser.execute_script(js_bottom)

    time.sleep(2)

    # Get the "next page" link (not named `next`, to avoid shadowing the builtin)
    next_page_link = browser.find_element_by_xpath('//a[@class="n"]')

    # Click "next page"
    next_page_link.click()

    time.sleep(2)

    # Navigate back to the previous page
    browser.back()

    time.sleep(2)

    # Navigate forward again
    browser.forward()

    time.sleep(3)
finally:
    # Quit in `finally` so the browser is closed even when a step above fails.
    browser.quit()

33、爬虫_selenium_phantomjs的基本使用



import time

from selenium import webdriver

path = 'phantomjs.exe'

# NOTE(review): PhantomJS support is deprecated in later Selenium releases;
# the headless-Chrome section is the maintained alternative - confirm the
# installed Selenium version still ships webdriver.PhantomJS.
browser = webdriver.PhantomJS(path)

try:
    url = 'https://www.baidu.com'
    browser.get(url)

    # There is no visible window, so screenshots are used to see the page.
    browser.save_screenshot('baidu.png')

    time.sleep(2)

    # Not named `input`, to avoid shadowing the builtin.
    search_box = browser.find_element_by_id('kw')
    search_box.send_keys('昆凌')

    time.sleep(3)

    browser.save_screenshot('kunling.png')
finally:
    # The browser is invisible, so a leaked process would go unnoticed; always quit.
    browser.quit()

34、爬虫_selenium_headless

# from selenium import webdriver
# from selenium.webdriver.chrome.options import Options
#
# chrome_options = Options()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-gpu')
#
# # path是你自己的chrome浏览器的文件路径
# path = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
# chrome_options.binary_location = path
#
# browser = webdriver.Chrome(chrome_options=chrome_options)
#
#
# url = 'https://www.baidu.com'
#
# browser.get(url)
#
# browser.save_screenshot('baidu.png')

# 封装的headless


from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def share_browser(binary_path=r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'):
    """Create and return a headless Chrome WebDriver.

    Args:
        binary_path: Path of the local Chrome executable. The default is the
            path that was hard-coded in the original script; pass your own
            path when Chrome is installed elsewhere.

    Returns:
        A ``webdriver.Chrome`` instance started with ``--headless`` and
        ``--disable-gpu``. The caller is responsible for calling ``quit()``.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')

    # Tell the driver which Chrome binary to launch.
    chrome_options.binary_location = binary_path

    # `options=` is the supported keyword; `chrome_options=` is deprecated.
    browser = webdriver.Chrome(options=chrome_options)
    return browser

browser = share_browser()

try:
    url = 'https://www.baidu.com'

    browser.get(url)
finally:
    # A headless browser has no window to close by hand, so always quit it
    # explicitly to avoid leaving the process running.
    browser.quit()


35、爬虫_requests_基本使用


import requests


url = 'http://www.baidu.com'

# Send a GET request; the result is a Response object.
response = requests.get(url)

# One type and six attributes
# The type is Response
# print(type(response))

# Set the encoding used to decode the response body
# response.encoding = 'utf-8'

# The page source as a string
# print(response.text)

# The url of the response
# print(response.url)

# The body as raw bytes
# print(response.content)

# The HTTP status code of the response
# print(response.status_code)

# The response headers
print(response.headers)

36、爬虫_requests_get请求


# Topics covered with urllib
# (1) one type and six methods
# (2) GET requests
# (3) POST requests   Baidu Translate
# (4) ajax GET requests
# (5) ajax POST requests
# (6) cookie login   Weibo
# (7) proxies


# Topics covered with requests
# (1) one type and six attributes
# (2) GET requests
# (3) POST requests
# (4) proxies
# (5) cookies   captchas


import requests

url = 'https://www.baidu.com/s'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

# Query-string parameters, passed as a plain dict.
data = {'wd': '北京'}

# requests.get(url, params, **kwargs)
#   url     the resource path
#   params  the query parameters
#   kwargs  further keyword options such as headers
response = requests.get(url, params=data, headers=headers)

content = response.text
print(content)

# Summary:
# (1) parameters are passed through `params`
# (2) the parameters do not need to be urlencoded by hand
# (3) no customised request object is needed
# (4) the '?' at the end of the resource path is optional

37、爬虫_requests_post请求



import json

import requests

url = 'https://fanyi.baidu.com/sug'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

data = {
    'kw': 'eye'
}

# url     the request address
# data    the form parameters
# kwargs  further keyword options such as headers
response = requests.post(url=url, data=data, headers=headers)

content = response.text

# `content` is already a str, so no encoding argument is needed. The former
# `encoding='utf-8'` keyword was removed from json.loads in Python 3.9 and
# raises TypeError there.
obj = json.loads(content)
print(obj)

# Summary:
# (1) a POST request needs no manual encoding/decoding of the parameters
# (2) POST parameters are passed through `data`
# (3) no customised request object is needed

38、爬虫_requests_代理



import requests

url = 'http://www.baidu.com/s?'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
}

# Searching for "ip" makes the result page show the address the request came from.
data = {'wd': 'ip'}

# Proxy server to use for http:// URLs.
proxy = {'http': '212.129.251.55:16816'}

response = requests.get(url, params=data, headers=headers, proxies=proxy)

# Save the result page so it can be opened in a browser and inspected.
content = response.text
with open('daili.html', 'w', encoding='utf-8') as fp:
    fp.write(content)

39、爬虫_requests_cookie登陆古诗文网

# Log in, then land on the main (collection) page.


# Inspecting the login endpoint shows that logging in needs many parameters:
# __VIEWSTATE: /m1O5dxmOo7f1qlmvtnyNyhhaUrWNVTs3TMKIsm1lvpIgs0WWWUCQHl5iMrvLlwnsqLUN6Wh1aNpitc4WnOt0So3k6UYdFyqCPI6jWSvC8yBA1Q39I7uuR4NjGo=
# __VIEWSTATEGENERATOR: C93BE1AE
# from: http://so.gushiwen.cn/user/collect.aspx
# email: 595165358@qq.com
# pwd: action
# code: PId7
# denglu: 登录

# __VIEWSTATE, __VIEWSTATEGENERATOR and code change between requests.

# Difficulties:
# (1) __VIEWSTATE and __VIEWSTATEGENERATOR are hidden fields; values that are
#     not visible on the page usually live in the page source, so the source
#     is fetched and parsed to extract them.
# (2) the captcha.

import requests
from bs4 import BeautifulSoup


# URL of the login page
url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

# One session carries the cookies across every request of the login flow
# (login page, captcha image, login POST), just like a browser would.
session = requests.session()

# Fetch the page source through the session so its cookies are kept.
response = session.get(url=url, headers=headers)
content = response.text

# Parse the page source and extract the hidden form fields.
soup = BeautifulSoup(content, 'lxml')

# __VIEWSTATE
viewstate = soup.select('#__VIEWSTATE')[0].attrs.get('value')

# __VIEWSTATEGENERATOR
viewstategenerator = soup.select('#__VIEWSTATEGENERATOR')[0].attrs.get('value')


# Location of the captcha image
code = soup.select('#imgCode')[0].attrs.get('src')
code_url = 'https://so.gushiwen.cn' + code

# Pitfall: downloading the image with urllib.request.urlretrieve would be an
# unrelated request, so the captcha shown would not belong to the session that
# submits the form. The image therefore has to be fetched through the session.
# import urllib.request
# urllib.request.urlretrieve(url=code_url,filename='code.jpg')

# Content behind the captcha url
response_code = session.get(code_url)
# The image is binary data, so use .content rather than .text
content_code = response_code.content
# 'wb' writes the bytes to the file unchanged
with open('code.jpg', 'wb') as fp:
    fp.write(content_code)


# With the captcha saved locally, look at the image, type the characters on the
# console, and the value is sent as the `code` form field.

code_name = input('请输入你的验证码')


# Submit the login form
url_post = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'

data_post = {
    '__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewstategenerator,
    'from': 'http://so.gushiwen.cn/user/collect.aspx',
    'email': '595165358@qq.com',
    'pwd': '.....',
    'code': code_name,
    'denglu': '登录',
}

# Post to `url_post`; it was defined for this purpose but the request previously
# went to the login-page `url` instead.
response_post = session.post(url=url_post, headers=headers, data=data_post)

content_post = response_post.text

with open('gushiwen.html', 'w', encoding='utf-8') as fp:
    fp.write(content_post)


# Difficulties
# (1) hidden form fields
# (2) the captcha


40、爬虫_requests_超级鹰打码平台的使用

#!/usr/bin/env python
# coding:utf-8

import requests
from hashlib import md5

class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition service.

    The constructor prepares the form fields and request headers that are
    sent with every call; the two public methods each issue one POST request
    and return the decoded JSON reply.
    """

    def __init__(self, username, password, soft_id):
        """Remember the account details and prebuild the shared request data.

        username: account name, sent as the ``user`` form field.
        password: plain-text password (str); only its MD5 hex digest is
            stored and sent, as the ``pass2`` form field.
        soft_id: software ID, sent as the ``softid`` form field.
        """
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        # Form fields included in every request.
        self.base_params = dict(
            user=self.username,
            pass2=self.password,
            softid=self.soft_id,
        )
        # HTTP headers included in every request.
        self.headers = dict([
            ('Connection', 'Keep-Alive'),
            ('User-Agent', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)'),
        ])

    def PostPic(self, im, codetype):
        """Upload a captcha image and return the service's JSON reply.

        im: image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        """
        endpoint = 'http://upload.chaojiying.net/Upload/Processing.php'
        # Shared fields are merged after 'codetype', as in a dict.update().
        form_fields = {'codetype': codetype, **self.base_params}
        upload = {'userfile': ('ccc.jpg', im)}
        reply = requests.post(endpoint, data=form_fields, files=upload, headers=self.headers)
        return reply.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised captcha and return the JSON reply.

        im_id: image ID of the captcha being reported
        """
        endpoint = 'http://upload.chaojiying.net/Upload/ReportError.php'
        form_fields = {'id': im_id, **self.base_params}
        reply = requests.post(endpoint, data=form_fields, headers=self.headers)
        return reply.json()


if __name__ == '__main__':
    # Demo: upload one local captcha image and print the service's reply.
    # Replace the placeholders with your own account name and password, and
    # 96001 with a software ID generated under "User Center >> Software ID".
    chaojiying = Chaojiying_Client('超级鹰用户名', '超级鹰用户名的密码', '96001')
    # Replace a.jpg with the path of the local image file (on Windows the
    # path separators sometimes need to be written as //).
    # The with-block closes the file handle once the bytes are read.
    with open('a.jpg', 'rb') as image_file:
        im = image_file.read()
    # 1902 is the captcha type; see "Pricing" on the official website.
    print(chaojiying.PostPic(im, 1902))


41、爬虫_scrapy_安装

# (1) pip install scrapy
# (2) 报错1: building 'twisted.test.raiser' extension
#              error: Microsoft Visual C++ 14.0 is required. Get it with "Microsoft Visual C++
#              Build Tools": http://landinghub.visualstudio.com/visual-cpp-build-tools
#     解决1
#       http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
#       Twisted‑20.3.0‑cp37‑cp37m‑win_amd64.whl
#       cp37 表示你的 python 版本(此处为 3.7),请选择与本机 python 版本一致的文件
#       amd64 表示 64 位操作系统(32 位系统请选择 win32 的文件)
#       下载完成之后 使用 pip install <下载的 twisted whl 文件的路径> 安装
#       切记安装完twisted 再次安装scrapy

# (3) 报错2  提示python -m pip install --upgrade pip
#      解决2   运行python -m pip install --upgrade pip

# (4) 报错3   win32的错误
#      解决3   pip install pypiwin32

# (5) anaconda

42、爬虫_scrapy_scrapyshell

# 进入到scrapy shell的终端  直接在Windows的终端中输入 scrapy shell 域名
# 如果想看到一些高亮 或者 自动补全  那么可以安装ipython  pip install ipython


# scrapy shell www.baidu.com

文章来源地址https://uudwc.com/A/AZ4RE

原文地址:https://blog.csdn.net/qq_42740465/article/details/129432188

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处: 如若内容造成侵权/违法违规/事实不符,请联系站长进行投诉反馈,一经查实,立即删除!

h
上一篇 2023年06月25日 13:21
2.Python数据分析项目——旅游景点票价预测
下一篇 2023年06月25日 13:22