智一面提供 Python 的面试测试题
使用地址:http://www.gtalent.cn/exam/interview?token=52cf92de494f4a8b6165d817a7279966

敲了130多行代码,利用协程实现漫画下载,亲测没问题,目前海贼王更新到930话,全部下载下来1小时左右,供大家参考,一起共勉。

  1. from gevent import monkey;monkey.patch_all()
  2. from gevent.pool import Pool
  3. from bs4 import BeautifulSoup
  4. from fake_useragent import UserAgent
  5. from requests.packages.urllib3.exceptions import InsecureRequestWarning
  6.  
  7. import gevent
  8. import requests
  9. import time
  10. import os
  11. import shutil
  12.  
  13.  
  14. def getSource(urls, headers, types):
  15.     try:
  16.         # 实例化UserAgent类
  17.         user_agent = UserAgent()
  18.         # 为头文件随机分配User-Agent
  19.         headers['User-Agent'] = user_agent.random
  20.         # 禁用安全请求警告
  21.         requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
  22.         # 实例化Session
  23.         request_session = requests.Session()
  24.         # 设置重连次数
  25.         request_session.mount('http://', requests.adapters.HTTPAdapter(max_retries=5))
  26.         request_session.mount('https://', requests.adapters.HTTPAdapter(max_retries=5))
  27.         # 执行请求
  28.         get_response = request_session.get(urls, headers=headers, verify=False, timeout=(10, 10))
  29.         # 关闭请求
  30.         request_session.close()
  31.         # 设置编码
  32.         get_response.encoding = 'UTF-8'
  33.         # 判断获取源码还是图片
  34.         if types == 'text':
  35.             get_response = get_response.text
  36.         if types == 'content':
  37.             get_response = get_response.content
  38.     except Exception as e:
  39.         print('getSource()函数异常:' + str(e))
  40.     else:
  41.         return get_response
  42.  
  43.  
  44. def sourceAnalysis(src, dic, typ):
  45.     # 定义章节链接、标题、内容列表
  46.     chapter_link = []
  47.     chapter_name = []
  48.     chapter_cont = []
  49.     # 实例化BeautifulSoup
  50.     soup = BeautifulSoup(src, 'html.parser')
  51.     # 解析章节链接和标题
  52.     if typ == 'chapter':
  53.         analysis_lists = soup.find_all(dic['label'], class_=dic['class'])
  54.         # 提取章节链接和标题
  55.         for i in range(len(analysis_lists)):
  56.             chapter_link.append(DOMAIN + analysis_lists[i].get('data-hreflink'))
  57.             chapter_name.append(analysis_lists[i].get_text().strip())
  58.         chapter_dic = {'chapter_link': chapter_link, 'chapter_name': chapter_name}
  59.         return chapter_dic
  60.     # 解析章节内图片链接
  61.     if typ == 'content':
  62.         analysis_lists = soup.find_all(dic['label'], class_=dic['class'])
  63.         # 提取章节内图片链接
  64.         for i in range(len(analysis_lists)):
  65.             chapter_cont.append(analysis_lists[i].get('data-src'))
  66.         return chapter_cont
  67.  
  68.  
  69. if __name__ == '__main__':
  70.     # 系统启动时间
  71.     start_time = time.time()
  72.  
  73.     # 定义常量
  74.     DOMAIN = 'https://www.mkzhan.com/'
  75.     REQUEST_URL = 'https://www.mkzhan.com/209871/'
  76.     HEADERS = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  77.                'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
  78.                'Connection': 'keep-alive',
  79.                'User-Agent': ''}
  80.     LINK_PROPERTY = {'label': 'a', 'class': 'j-chapter-link'}
  81.     IMAG_PROPERTY = {'label': 'img', 'class': 'lazy-read'}
  82.     POOL = Pool(100)
  83.     ROOT_PATH = "D:/OnePiece/"
  84.  
  85.     # 创建存储漫画文件夹,如果已有文件夹,则删除再新建
  86.     if os.path.exists(ROOT_PATH):
  87.         shutil.rmtree(ROOT_PATH)
  88.     os.mkdir(ROOT_PATH)
  89.  
  90.     # 获取目录页源码
  91.     function_run_time = time.time()
  92.     print(time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime()) + " 获取目录页源码开始...")
  93.     catalog_source = getSource(REQUEST_URL, HEADERS, 'text')
  94.     print(time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime()) + " 获取目录页源码完成...[ %.1fs ]" % (time.time() - function_run_time))
  95.  
  96.     # 解析章节信息
  97.     function_run_time = time.time()
  98.     print(time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime()) + " 解析章节信息开始...")
  99.     chapter_info = sourceAnalysis(catalog_source, LINK_PROPERTY, 'chapter')
  100.     print(time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime()) + " 解析章节信息完成...[ %.1fs ]" % (time.time() - function_run_time))
  101.  
  102.     # 获取每章节源码
  103.     function_run_time = time.time()
  104.     print(time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime()) + " 获取每章节源码开始...")
  105.     get_source_worker = [POOL.spawn(getSource, url, HEADERS, 'text') for url in chapter_info['chapter_link']]
  106.     gevent.joinall(get_source_worker)
  107.     chapter_source = [source.value for source in get_source_worker]
  108.     print(time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime()) + " 获取每章节源码完成...[ %.1fs ]" % (time.time() - function_run_time))
  109.  
  110.     # 解析章节内图片链接
  111.     function_run_time = time.time()
  112.     print(time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime()) + " 解析章节内图片链接开始...")
  113.     get_imglink_worker = [POOL.spawn(sourceAnalysis, src, IMAG_PROPERTY, 'content') for src in chapter_source]
  114.     gevent.joinall(get_imglink_worker)
  115.     image_list = [link.value for link in get_imglink_worker]
  116.     print(time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime()) + " 解析章节内图片链接完成...[ %.1fs ]" % (time.time() - function_run_time))
  117.  
  118.     # 下载漫画
  119.     for i in range(len(chapter_info['chapter_name'])):
  120.         function_run_time = time.time()
  121.         print(time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime()) + " 下载 " + chapter_info['chapter_name'][i] + " 开始...")
  122.         get_images_worker = [POOL.spawn(getSource, url, HEADERS, 'content') for url in image_list[i]]
  123.         gevent.joinall(get_images_worker)
  124.         # 创建章节文件夹
  125.         save_path = ROOT_PATH + chapter_info['chapter_name'][i] + '/'
  126.         os.mkdir(save_path)
  127.         for j in range(len(get_images_worker)):
  128.             with open(save_path + str(j) + '.jpg', 'wb') as image:
  129.                 image.write(get_images_worker[j].value)
  130.         print(time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime()) + " 下载 " + chapter_info['chapter_name'][i] + " 完成...[ %.1fs ]" % (time.time() - function_run_time))
  131.  
  132.     print(time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime()) + ' System executing done...[ %.1fs ]' % (time.time() - start_time))