warning 我不是卷怪
必须声明的是,这个英语课不是我报名的,我也没有上过。我不是卷怪,请相信我。
抓包
鲸打卡是一个微信小程序。它也可以网页扫码登录。网页抓包比较简单,只需要浏览器开发者模式即可。然而,现在遇到的问题是网页端音频404而手机可以播放,无法直接从网页获取音频直链,只能小程序抓包。
老规矩,爬取(crawl)之前需要先抓包(sniff packets)。众所周知,微信小程序比较难抓包。它强制SSL链接且使用白名单证书,使得中间人攻击难以进行,人家根本不认你自己生成的证书,加入可信根证书也没用。
经尝试,安卓端完美失败。好在ios对微信好像做了一些限制,ios微信信任你在ios新添的证书。
打开我们的老朋友fiddler,导出根证书并在ios端导入。设置ios代理为你电脑ip:8888端口。
设置好filter,不然太乱了。
可以看到请求header中包含appid和apsid。经尝试,其中apsid就是登录状态了,去掉以后就403;appid去掉以后没什么影响,对应的应该是鲸打卡的二次部署小程序,我这里是普特英语
有一个请求GET /user/course/mine?offset=0&limit=10 HTTP/1.1
,返回是json,里面包含了course_id和course_name,而course_id就是我们要的。这里省略了json中很多无关的信息以及涉及隐私的信息。
{
"data": {
"count": 2,
"list": [
{
"course_id": 221170,
"title": "30天高级听力训练营",
"practice_count": 30,
"course_name": "30天高级听力训练营",
"alarm_state": 1
},
{
"course_id": 221139,
"title": "40天中级听力训练营",
"practice_count": 40,
"course_name": "40天中级听力训练营",
"alarm_state": 1
}
]
},
"err_code": 0,
"err_msg": "SUCCESS"
}
有可疑请求GET /user/unlock/calendar?course_id=221170&sequence_numbers=2
,很好理解,解锁这个课程的第二课,返回json如下(同样精简了一下)
{
"data": {
"course_number": 30,
"current_course_unlock": 1,
"unlock_next_number": 2,
"unlock_sequence_data": [
{
"calendar_id": 7566696,
"is_offline": false,
"submit_sequence": 2,
"valid_status": true
}
],
"unlock_status": 0
},
"err_code": 0,
"err_msg": "SUCCESS"
}
这里的重点就是calendar_id
,下面马上会用到。以及course_number
表示总课时数。
重磅请求GET /user/get_theme?calendar_id=7566696 HTTP/1.1
,返回信息中包含了我们需要的内容!!
{
"data": {
"calendar_id": 7566696,
"calendar_left": 7566695,
"calendar_right": 7566697,
"calendar_title": "",
"course_id": 221170,
"course_title": "30天高级听力训练营",
"paper_id": "",
"pc_content": "[{\"type\":\"voice\",\"content\":[{\"voice_url\":\"https://cdn-qiye-voice.jingdaka.com/backend_voice/2019/01/28/XmAxYNP5zbx3nctPCxfifARKwJSiamEQ.mp3\",\"voice_name\":\"02完整版.mp3\",\"voice_duration\":87,\"voice_avatar\":\"https://cdn-qiye.jingdaka.com/backend_pic/dst/poster/2018/10/18/341790d2-6382-4635-9b78-2c2bc2e00b4f.jpg\"}],\"key\":\"6DF9TA6W\"},{\"type\":\"voice\",\"content\":[{\"voice_url\":\"https://cdn-qiye-voice.jingdaka.com/backend_voice/2019/01/28/nrWyni376ZBeARRpt2MQP6mN3ETnwbfW.mp3\",\"voice_name\":\"片段1.mp3\",\"voice_duration\":29,\"voice_avatar\":\"https://cdn-qiye.jingdaka.com/backend_pic/dst/poster/2018/10/18/341790d2-6382-4635-9b78-2c2bc2e00b4f.jpg\"}],\"key\":\"Q9XNTU3E\"},{\"type\":\"voice\",\"content\":[{\"voice_url\":\"https://cdn-qiye-voice.jingdaka.com/backend_voice/2019/01/28/bTbXHkJk5EyYMfjjjraw8rcmxWm6z64m.mp3\",\"voice_name\":\"讲解1.mp3\",\"voice_duration\":349,\"voice_avatar\":\"https://cdn-qiye.jingdaka.com/backend_pic/dst/poster/2018/10/18/aaab4283-f9da-4b5a-b26c-167965665b46.jpg\"}],\"key\":\"T5K4SFCU\"},{\"type\":\"voice\",\"content\":[{\"voice_url\":\"https://cdn-qiye-voice.jingdaka.com/backend_voice/2019/01/28/6iXXBFBT6ECFhaMf86W7KPJQHftBFStE.mp3\",\"voice_name\":\"片段2.mp3\",\"voice_duration\":29,\"voice_avatar\":\"https://cdn-qiye.jingdaka.com/backend_pic/dst/poster/2018/10/18/341790d2-6382-4635-9b78-2c2bc2e00b4f.jpg\"}],\"key\":\"I8GT2JMB\"},{\"type\":\"voice\",\"content\":[{\"voice_url\":\"https://cdn-qiye-voice.jingdaka.com/backend_voice/2019/01/28/C8w4x5xKmexZzmCCzH6AaYZAA6ciznTk.mp3\",\"voice_name\":\"讲解2.mp3\",\"voice_duration\":435,\"voice_avatar\":\"https://cdn-qiye.jingdaka.com/backend_pic/dst/poster/2018/10/18/aaab4283-f9da-4b5a-b26c-167965665b46.jpg\"}],\"key\":\"HD8MMW7Z\"},{\"type\":\"voice\",\"content\":[{\"voice_url\":\"https://cdn-qiye-voice.jingdaka.com/backend_voice/2019/01/28/axSCCiM3JfJNEJasij2wyGK4jn83sjGY.mp3\",\"voice_name\":\"片段3.mp3\",\"voice_duration\":30,\"voice_avatar\":\"https://cdn-qiye.
jingdaka.com/backend_pic/dst/poster/2018/10/18/341790d2-6382-4635-9b78-2c2bc2e00b4f.jpg\"}],\"key\":\"297VBBHY\"},{\"type\":\"voice\",\"content\":[{\"voice_url\":\"https://cdn-qiye-voice.jingdaka.com/backend_voice/2019/01/28/yyc8pTE5hX7K4NmG4Ac23Z8zM6Gk3Csk.mp3\",\"voice_name\":\"讲解3.mp3\",\"voice_duration\":265,\"voice_avatar\":\"https://cdn-qiye.jingdaka.com/backend_pic/dst/poster/2018/10/18/aaab4283-f9da-4b5a-b26c-167965665b46.jpg\"}],\"key\":\"BNMAH4RB\"},{\"type\":\"pdf\",\"content\":[{\"url\":\"2020/08/25/in6Q8FayeX0R7Sl9QQ4iGGFOAdjkZhks.pdf\",\"name\":\"表达讲义 高级02.pdf\"}],\"key\":\"vefmgYMZ\"},{\"type\":\"pdf\",\"content\":[{\"url\":\"https://jingdaka-doc.oss-cn-shanghai.aliyuncs.com/document/2019/01/28/5kph8QnxbjzCSSTQy3EmmJi4kWjikeed.pdf\",\"name\":\"语音讲义 高级 02.pdf\"}],\"key\":\"MCQAQF74\"},{\"key\":\"GNI5BAXN\",\"type\":\"courseLink\",\"content\":[{\"appId\":\"\",\"courseId\":267189,\"courseName\":\"【免费试听音标讲解】节选5周语音基础课\",\"courseType\":2}]},{\"key\":\"X381LZ97\",\"type\":\"courseLink\",\"content\":[{\"appId\":\"\",\"courseId\":221018,\"courseName\":\"30天初级听力训练营\",\"courseType\":2}]},{\"key\":\"YBKRRMNF\",\"type\":\"courseLink\",\"content\":[{\"appId\":\"\",\"courseId\":221139,\"courseName\":\"40天中级听力训练营\",\"courseType\":2}]}]",
"status": 1
},
"err_code": 0,
"err_msg": "SUCCESS"
}
这里pc_content是一个dumps过的json字符串,意思应该是pc端资源!!包含了我们需要的资源直链!!我们json.loads()一下,马上就得到数据如下。同样,这里精简了很多内容。
[
{
"type": "voice",
"content": [
{
"voice_url": "https://cdn-qiye-voice.jingdaka.com/backend_voice/2019/01/28/XmAxYNP5zbx3nctPCxfifARKwJSiamEQ.mp3",
"voice_name": "02完整版.mp3",
"voice_duration": 87,
"voice_avatar": "https://cdn-qiye.jingdaka.com/backend_pic/dst/poster/2018/10/18/341790d2-6382-4635-9b78-2c2bc2e00b4f.jpg"
}
],
"key": "6DF9TA6W"
},
{
"type": "voice",
"content": [
{
"voice_url": "https://cdn-qiye-voice.jingdaka.com/backend_voice/2019/01/28/nrWyni376ZBeARRpt2MQP6mN3ETnwbfW.mp3",
"voice_name": "片段1.mp3",
"voice_duration": 29,
"voice_avatar": "https://cdn-qiye.jingdaka.com/backend_pic/dst/poster/2018/10/18/341790d2-6382-4635-9b78-2c2bc2e00b4f.jpg"
}
],
"key": "Q9XNTU3E"
},
{
"type": "pdf",
"content": [
{
"url": "https://jingdaka-doc.oss-cn-shanghai.aliyuncs.com/document/2019/01/28/5kph8QnxbjzCSSTQy3EmmJi4kWjikeed.pdf",
"name": "语音讲义 高级 02.pdf"
}
],
"key": "MCQAQF74"
},
{
"key": "GNI5BAXN",
"type": "courseLink",
"content": [
{
"appId": "",
"courseId": 267189,
"courseName": "【免费试听音标讲解】节选5周语音基础课",
"courseType": 2
}
]
},
{
"key": "X381LZ97",
"type": "courseLink",
"content": [
{
"appId": "",
"courseId": 221018,
"courseName": "30天初级听力训练营",
"courseType": 2
}
]
},
{
"key": "YBKRRMNF",
"type": "courseLink",
"content": [
{
"appId": "",
"courseId": 221139,
"courseName": "40天中级听力训练营",
"courseType": 2
}
]
}
]
很漂亮啊!url和name都给了。我们后面只需要把type不是voice或者pdf的资源过滤掉,直接下载就好了。
wait!还记得电脑上404吗?没错。这里的voice_url一个都打不开,这个是普特英语的问题,他们官方后来修复了,但仍然只能在小程序端收听,网页端听不了。
在小程序中点击播放,查看抓包情况。
好家伙,原来小程序端需要额外GET /picture/getTranscodeVideoUrl?video_url=https....
,怪不得可以听。从返回json中再次截取url,就得到了实际的voice url了
{
"data": {
"transcode_video_url": "https://cdn-transcode.jingdaka.com/backend_voice/2019/01/23/0fcb9422-843c-4ef7-aa85-6ce369ccb173.mp3"
},
"err_code": 0,
"err_msg": "SUCCESS"
}
最后python代码实现:
思路是先get_json把pc_content资源json保存到本地,再读取本地json获取真实voice_url并下载。
在下载资源时使用了异步aiohttp模块,显著提高了下载速度,跑满带宽了!可以作为一个asyncio典型例子。
这个需要你根据实际情况修改apsid。以及,我直接手动获取calendar_id,懒得自动获取了...
# Code BY ZZX
#用来保存json
from json import loads as json_loads
from json import dumps as json_dumps
import requests
import os
# jdk:鲸打卡
class fuck_jdk(object):
    '''
    Fetch the resource list ("pc_content") of one Jingdaka lesson.

    A lesson is addressed by its calendar_id; the instance stores the id it
    will query and the methods turn the API response into a list of
    downloadable resources (audio and pdf entries).
    '''
    base_url = 'https://apiopen.jingdaka.com/user/get_theme?calendar_id='
    # Some pdf entries carry only a relative path; this is the prefix that
    # turns such a path into a full document URL.
    doc_base_url = 'https://jingdaka-doc.oss-cn-shanghai.aliyuncs.com/document/'
    headers_ = {
        'Host': 'apiopen.jingdaka.com',
        'Connection': 'keep-alive',
        'apsid': '自己设',  # the key parameter: login-session id
        'version': '8.3.9',
        'appid': '自己设',  # WeChat mini-program id of the app built on Jingdaka
        'content-type': 'application/json',
        'Filter': 'test1',
        'Accept-Encoding': 'gzip,compress,br,deflate',
        'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 14_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.18(0x1800122c) NetType/WIFI Language/zh_CN',
        'Referer': '自己设'
    }

    def __init__(self, add, calendar_id=7566695) -> None:
        '''
        Select the lesson to query.

        :param add: 1-based lesson number; the queried id is
            ``calendar_id + add - 1``
        :param calendar_id: calendar_id of lesson 1 (the default, 7566695,
            is lesson 01 of the advanced course)
        '''
        self.calendar_id = calendar_id + add - 1

    def _getpage(self) -> dict:
        '''
        Return the raw get_theme response for this lesson, decoded from JSON.
        '''
        url_ = '{}{}'.format(fuck_jdk.base_url, self.calendar_id)
        return requests.get(url=url_, headers=fuck_jdk.headers_).json()

    def get_json_list(self) -> list:
        '''
        Return the decoded "pc_content" list of this lesson.

        :return: a list of dicts, one per resource. Each dict has a "type"
            key (resource type) and a "content" key holding a one-element
            list whose element is a dict with the resource url and file name.
        '''
        tmp = self._getpage()
        # pc_content is itself a JSON-encoded string inside the response.
        return json_loads(tmp['data']["pc_content"])

    @staticmethod
    def get_pure_json_list(raw_list: list) -> list:
        '''
        Drop every entry that is not a downloadable resource.

        Only audio ("voice") and pdf entries are kept; edit the tuple below
        to keep other types.
        '''
        wanted = ('voice', 'pdf')
        return [item for item in raw_list if item.get('type') in wanted]

    @staticmethod
    def _resource_info(item: dict) -> tuple:
        '''
        Return ``(url, file_name)`` for one resource entry.

        The url and name live in the first element of ``item['content']``;
        audio entries use the keys voice_url/voice_name, all others url/name.
        A url that is not absolute is prefixed with ``doc_base_url``.
        '''
        entry = item['content'][0]
        if item['type'] == 'voice':  # audio
            url_, name = entry['voice_url'], entry['voice_name']
        else:  # document
            url_, name = entry['url'], entry['name']
        if not url_.startswith('http'):
            url_ = fuck_jdk.doc_base_url + url_
        return url_, name

    @staticmethod
    def down_content(json: str, path: str) -> None:
        '''
        Download every resource listed in a JSON string into a directory.

        :param json: JSON text of a resource list (as produced from
            get_pure_json_list)
        :param path: existing directory that receives the files

        Only documents and audio are handled; for video adjust the keys read
        in _resource_info. This method is synchronous and therefore slow, and
        it does not resolve the real (transcoded) voice url.
        '''
        # Files are served by other hosts than the API: do not send them the
        # API's Host value or the login-session id.
        download_headers = {
            key: fuck_jdk.headers_[key]
            for key in ('Connection', 'User-Agent')
            if key in fuck_jdk.headers_
        }
        for item in json_loads(json):
            url_, name = fuck_jdk._resource_info(item)
            path_ = os.path.join(path, name)
            content = requests.get(url=url_, headers=download_headers).content
            with open(path_, 'wb') as f:
                f.write(content)
if __name__ == '__main__':
    # Save the filtered resource list of lessons 1..30 as json_gj/NN.json.
    # open(..., 'w') does not create directories, so make sure it exists.
    os.makedirs('json_gj', exist_ok=True)
    for i in range(1, 31):
        inst = fuck_jdk(i)  # one short-lived instance per lesson
        # Fetch before opening the file, so a failed request does not leave
        # an empty json file behind.
        raw_list = inst.get_json_list()
        tmp = inst.get_pure_json_list(raw_list)
        with open('json_gj/{:0>2d}.json'.format(i), 'w', encoding='utf8') as f:
            f.write(json_dumps(tmp, ensure_ascii=False, separators=(',', ':')))
        print(inst.calendar_id)
# Code BY ZZX
#用来下载资源
from get_json import fuck_jdk as fuck
from json import loads as json_loads
import asyncio
import aiohttp
import aiofiles
import time
import os
# Start timestamp, taken before any work is done.
a = time.time()

# Output paths of downloads that failed.
errs = list()

# Headers used when fetching the media/document files themselves.
headers_ = {
    'Connection': 'keep-alive',
    'User-Agent': (
        'Mozilla/5.0 (iPad; CPU OS 14_5 like Mac OS X) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Mobile/15E148 MicroMessenger/8.0.18(0x1800122c) '
        'NetType/WIFI Language/zh_CN'
    ),
}
def read_json(x):
    '''
    Load the saved resource list of lesson number x.

    Reads json_zj/NN.json (NN = x zero-padded to two digits) as UTF-8 and
    returns the decoded JSON value.
    '''
    with open('json_zj/{:0>2d}.json'.format(x), 'r', encoding='utf8') as fh:
        return json_loads(fh.read())
async def write_content(url, i, name, type):
    '''
    Download one resource and save it as zj/NN/<name>.

    :param url: resource url; for audio this is the voice_url from the
        lesson data, which is first exchanged for the transcoded url
    :param i: lesson number, used for the two-digit target directory
    :param name: file name to save under
    :param type: resource type; 'voice' triggers the transcode lookup

    Failures are not raised: the target path is appended to ``errs``.
    '''
    target = 'zj/{:0>2d}/{}'.format(i, name)
    # NOTE: one session per task is wasteful; a shared session would be
    # better but needs a different call signature.
    async with aiohttp.ClientSession() as client_:
        try:
            if type == 'voice':
                # Ask the API for the real (transcoded) audio url. Passing
                # the original url via params lets the client escape it.
                async with client_.get(
                    'https://apiopen.jingdaka.com/picture/getTranscodeVideoUrl',
                    params={'video_url': url},
                    headers=fuck.headers_,
                ) as resp:
                    resp.raise_for_status()
                    payload = await resp.json()
                url = payload['data']['transcode_video_url']
            async with client_.get(url, headers=headers_) as resp:
                # Without this an error page would be saved as the file.
                resp.raise_for_status()
                content = await resp.read()
            with open(target, 'wb') as f:
                print(target)
                f.write(content)
        except Exception:
            # Best effort: remember the failed item and carry on. Using
            # Exception (not a bare except) lets cancellation and Ctrl-C
            # propagate.
            errs.append(target)
def get_info(item) -> tuple:
    '''
    Return ``(url, file_name, type)`` for one resource entry.

    The url and name are read from the first element of ``item['content']``;
    audio entries use the keys voice_url/voice_name, every other type uses
    url/name. The url is returned exactly as stored.
    '''
    entry = item['content'][0]
    kind = item['type']
    if kind == 'voice':  # audio
        return entry['voice_url'], entry['voice_name'], kind
    # document
    return entry['url'], entry['name'], kind
def gather_():
    '''
    Build the download coroutines for lessons 2..30.

    For each lesson the target directory zj/NN is created, the saved
    resource list is read, and one write_content coroutine is queued per
    resource. Nothing is downloaded here.

    :return: list of un-awaited write_content coroutines
    '''
    tasks = []
    print('获取下载队列....')
    for i in range(2, 31):
        # Creates the parent "zj" too and tolerates an existing directory;
        # any other failure (e.g. permissions) is raised instead of hidden.
        os.makedirs('zj/{:0>2d}'.format(i), exist_ok=True)
        for item in read_json(i):
            url_, name_, type_ = get_info(item)
            if not url_.startswith('http'):
                # Relative document path: prepend the document host.
                url_ = 'https://jingdaka-doc.oss-cn-shanghai.aliyuncs.com/document/' + url_
            tasks.append(write_content(url=url_, i=i, name=name_, type=type_))
    return tasks
async def main():
    '''Run every queued download concurrently and return the gathered results.'''
    print('开始下载....')
    pending = gather_()
    results = await asyncio.gather(*pending)
    return results
# asyncio.run creates, runs and closes the event loop; calling
# asyncio.get_event_loop() with no running loop is deprecated.
asyncio.run(main())
b = time.time()
# Report the failed items, if any, then the total elapsed time.
if errs:
    print('下载完毕,错误项:')
    for x in errs:
        print(x)
else:
    print('下载完毕,无错误!')
print('总耗时', b-a)
Comments NOTHING