These notes record the written test I did to join one of my school's labs. The script below crawls the MHLW (Japan's Ministry of Health, Labour and Welfare) COVID-19 archive pages and, for each day, downloads the prefecture-level PCR-positive-count PDF and the accompanying chart image.
import requests as rq
from bs4 import BeautifulSoup
import html5lib  # parser backend for BeautifulSoup, imported to document the dependency
import os
import datetime
import re
import linecache
import base64

all_m_link = []  # links to every monthly archive page
m_number: int
today = datetime.datetime.today()
year = today.year
month = today.month
workdir = os.getcwd()  # output paths below are built Windows-style with backslashes


def get_m_links():  # collect the links to the monthly archive pages
    url = "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/0000121431_00086.html"
    response = rq.get(url)
    soup = BeautifulSoup(response.text, "html5lib")
    tag = soup.find("div", id="h2_free1")  # locate the anchor section
    # step down two siblings (past the whitespace node) and read every href
    for m_links in tag.next_sibling.next_sibling.find_all('a', href=True):
        all_m_link.append(m_links['href'])


def get_d_links(link, m):  # for each month, read the daily links and save them to a file
    response = rq.get(link)
    soup = BeautifulSoup(response.text, "html5lib")
    tag = soup.find("div", id="h2_free1")  # locate the anchor section
    with open("dlinks/dlinks" + str(m) + ".txt", "w+") as file:
        for d_links in tag.next_sibling.next_sibling.find_all('a', href=True):
            d_str_links = d_links['href']
            if d_str_links == "":
                file.write("not found\n")  # placeholder; i_to_date later skips it as a bad record
            elif d_str_links.find('http://') != -1:
                # str methods return new strings, so the result must be reassigned
                d_str_links = d_str_links.strip().replace("http://", 'https://')
                file.write(d_str_links + "\n")
            else:
                file.write(d_str_links + "\n")


def get_info(link, y, m, d):  # for one day's page, extract and save the PDF and the chart image
    try:
        response = rq.get(link)
    except:
        print(f"Unknown error while fetching the page for {y}-{m}-{d}; skipping and continuing")
        return
    try:
        # force UTF-8: autodetection garbles the page text otherwise
        soup = BeautifulSoup(response.text, "html5lib", from_encoding="UTF-8")
        tag = soup.find("div", class_="m-grid__col1")  # locate the content column
        # regex on the link text finds the "PCR positives by prefecture" PDF's whole <a> tag
        str_pdf = tag.find_all("a", string=re.compile("国内における都道府県別のPCR検査陽性者数.*"), limit=1)
        str_image = tag.find('img', src=True)['src']  # the chart image's src
    except:
        print(f"Unknown error while extracting links for {y}-{m}-{d}; skipping and continuing")
        return
    if str_image.find('data:image/png') == -1:
        img_link = 'https://www.mhlw.go.jp' + str_image  # ordinary image URL
        try:
            urldownload(img_link, workdir + '\\info\\image\\' + str(y) + '\\' + str(m) + '\\'
                        + str(y) + '年' + str(m) + '月' + str(d) + '日.png')
        except:
            print(f"Unknown error while downloading the image for {y}-{m}-{d}; skipping and continuing")
            print("The link extracted here was: " + img_link)
    else:
        # the image is inlined as a base64 data URI, so decode it instead of downloading
        try:
            imgdata = base64.b64decode(str_image.strip().replace("data:image/png;base64,", ''))
            with open(workdir + '\\info\\image\\' + str(y) + '\\' + str(m) + '\\'
                      + str(y) + '年' + str(m) + '月' + str(d) + '日.png', 'wb') as file:
                file.write(imgdata)
        except:
            print(f"Unknown error while decoding the image for {y}-{m}-{d}; skipping and continuing")
    # PDF URL: slice the href out of the stringified <a> tag
    pdf_link = 'https://www.mhlw.go.jp' + str(str_pdf)[str(str_pdf).find('/'):str(str_pdf).find('pdf') + 3]
    try:
        urldownload(pdf_link, workdir + '\\info\\pdf\\' + str(y) + '\\' + str(m) + '\\'
                    + str(y) + '年' + str(m) + '月' + str(d) + '日.pdf')
    except:
        print(f"Unknown error while downloading the PDF for {y}-{m}-{d}; skipping and continuing")
        print("The link extracted here was: " + pdf_link)


def urldownload(url, filename):  # download helper
    down_res = rq.get(url)
    with open(filename, 'wb') as file:
        file.write(down_res.content)


def check_folder():  # create the whole output directory tree up front
    i_year = year
    i_month = 1
    p_month = 1
    os.makedirs(workdir + '\\dlinks', exist_ok=True)
    os.makedirs(workdir + '\\info', exist_ok=True)
    os.makedirs(workdir + '\\info\\image', exist_ok=True)
    os.makedirs(workdir + '\\info\\pdf', exist_ok=True)
    # per-year and per-month folders under info/image and info/pdf
    while i_year > 2019:
        os.makedirs(workdir + '\\info\\pdf\\' + str(i_year), exist_ok=True)
        while p_month < 13:
            if not (i_year == 2020 and p_month < 6):  # no folders needed before June 2020
                os.makedirs(workdir + '\\info\\pdf\\' + str(i_year) + '\\' + str(p_month), exist_ok=True)
            p_month = p_month + 1
        os.makedirs(workdir + '\\info\\image\\' + str(i_year), exist_ok=True)
        while i_month < 13:
            if not (i_year == 2020 and i_month < 6):
                os.makedirs(workdir + '\\info\\image\\' + str(i_year) + '\\' + str(i_month), exist_ok=True)
            i_month = i_month + 1
        i_year = i_year - 1
        p_month = 1
        i_month = 1


def i_to_date(i):  # map month index i to concrete dates and download every day's data
    # the archive lists months newest first, so i counts back from the current month
    if i < month:
        this_year = year
        this_month = 12 - i
    elif i < month + 12:
        this_year = year - 1
        this_month = 12 - ((i - month) % 12)
    elif i < month + 24:
        this_year = year - 2
        this_month = 12 - ((i - month) % 12)
    else:  # i < month + 36; an else here avoids leaving the variables unbound
        this_year = year - 3
        this_month = 12 - ((i - month) % 12)
    file_name = workdir + '\\dlinks\\dlinks' + str(i) + '.txt'
    file_line = len(open(file_name).readlines())  # number of daily links in this month's file
    this_day = 0
    url_old = ""
    sp = False  # set right after the walk steps back across a month boundary
    while file_line > 0:  # read the file from its last line (oldest entry) upwards
        if sp:
            # undo the month step-back from the previous iteration and restart on day 1
            if this_month == 12:
                this_month = 1
                this_year = this_year + 1
            else:
                this_month = this_month + 1
            this_day = 1
            sp = False
        if this_day == 0:
            # first entry of a file: start from the last day of the previous month
            if this_month == 1 and this_year == 2020:
                this_day = 22  # the series starts on 2020-01-22
            elif this_month == 1:
                this_month = 12
                this_year = this_year - 1
                this_day = 31
                sp = True
            elif this_month in (2, 4, 6, 8, 9, 11):  # previous month has 31 days
                this_month = this_month - 1
                this_day = 31
                sp = True
            elif this_month == 3:  # previous month is February
                this_month = this_month - 1
                sp = True
                if (this_year % 4 == 0 and this_year % 100 != 0) or this_year % 400 == 0:
                    this_day = 29
                else:
                    this_day = 28
            else:  # previous month has 30 days
                this_month = this_month - 1
                this_day = 30
                sp = True
        url = linecache.getline(file_name, file_line).strip()
        if url.find('https://') == -1 and url.find('/stf/') >= 0:
            url = 'https://www.mhlw.go.jp' + url  # make relative links absolute
        elif url.find('https://') == -1:
            print('Bad record found, skipping')
            file_line = file_line - 1
            continue
        if url == url_old:  # checked after normalisation so relative links are covered too
            print('Duplicate record found, skipping')
            file_line = file_line - 1
            continue
        file_line = file_line - 1
        get_info(url, this_year, this_month, this_day)
        url_old = url
        if not sp:
            this_day = this_day + 1


if __name__ == '__main__':
    check_folder()
    get_m_links()
    i = 0
    m_number = len(all_m_link)
    while i < m_number:
        get_d_links(all_m_link[i], i)
        i = i + 1
    i = 0  # raise this index to resume an interrupted run partway through
    while i < m_number - 5:
        i_to_date(i)
        i = i + 1
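The hand-rolled calendar arithmetic in `i_to_date` (leap years, 31/30-day months, the 2020-01-22 start of the series) is the most fragile part of the script. Below is a minimal sketch of the same newest-to-oldest date walk using only the standard library; `SERIES_START` and `dates_backwards` are illustrative names of mine, not part of the original script.

    import datetime
    import itertools

    SERIES_START = datetime.date(2020, 1, 22)  # first day the MHLW series covers

    def dates_backwards(last_day):
        """Yield each date from last_day back to the series start, newest first."""
        current = last_day
        while current >= SERIES_START:
            yield current
            current -= datetime.timedelta(days=1)  # handles month ends and leap years

    # Example: pair the newest 31 entries with their dates, the way i_to_date
    # pairs one dlinks line with one day.
    for day in itertools.islice(dates_backwards(datetime.date(2022, 3, 31)), 31):
        print(f"{day.year}-{day.month}-{day.day}")

This would remove the `sp` flag and the per-month day-count branches entirely; the only domain knowledge left is the start date of the series.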
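A second weak point is error handling: the bare `except:` clauses swallow everything, including `KeyboardInterrupt`, and `urldownload` saves whatever body it receives, error page or not. Here is a sketch of a more defensive helper, assuming plain `requests`; the retry count and timeout are arbitrary values of mine, not something the original specifies.

    import requests

    def urldownload_safe(url, filename, retries=3, timeout=30):
        """Download url to filename; return True on success, False if all retries fail."""
        for attempt in range(retries):
            try:
                response = requests.get(url, timeout=timeout)
                response.raise_for_status()  # treat 4xx/5xx as failures instead of saving them
            except requests.RequestException as exc:
                print(f"attempt {attempt + 1}/{retries} failed for {url}: {exc}")
                continue
            with open(filename, 'wb') as file:
                file.write(response.content)
            return True
        return False

Returning a boolean lets the caller keep the script's existing skip-and-continue behaviour while still recording which day failed.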
Awesome!
>_<