Notes from the written test I took when applying to join a lab at my school.
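The script below scrapes the MHLW (Japan's Ministry of Health, Labour and Welfare) COVID-19 press-release pages: it first collects the monthly index links, then each day's report link within every month, and finally downloads each daily report's prefecture-level PCR positive-count PDF and summary image into a local info/ directory tree.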
import requests as rq
from bs4 import BeautifulSoup  # html5lib must also be installed; BeautifulSoup loads the parser by name
import os
import datetime
import re
import linecache
import base64
all_m_link = []  # list of links to every monthly index page
today = datetime.datetime.today()
year = today.year
month = today.month
workdir = os.getcwd()
def get_m_links():  # collect the monthly index links
    url = "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/0000121431_00086.html"
    response = rq.get(url)
    soup = BeautifulSoup(response.text, "html5lib")
    tag = soup.find("div", id="h2_free1")  # locate the anchor heading
    for m_links in tag.next_sibling.next_sibling.find_all('a', href=True):  # step two siblings down, then read every href
        all_m_link.append(m_links['href'])  # append each href to the list
    # monthly link collection ends here
def get_d_links(link, m):  # for one month, collect each day's link and save them to a file
    response = rq.get(link)
    soup = BeautifulSoup(response.text, "html5lib")
    tag = soup.find("div", id="h2_free1")  # locate the anchor heading
    with open("dlinks/dlinks" + str(m) + ".txt", "w") as file:
        for d_links in tag.next_sibling.next_sibling.find_all('a', href=True):  # step two siblings down, then read every href
            d_str_links = d_links['href']
            if d_str_links == "":
                file.write("not found\n")
            elif d_str_links.find('http://') != -1:
                d_str_links = d_str_links.strip().replace("http://", 'https://')  # normalize http:// links to https://
                file.write(d_str_links + "\n")
            else:
                file.write(d_str_links + "\n")
    # daily link collection ends here
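# get_info pulls one daily report page, locates the prefecture-level PCR table
# PDF by its Japanese title and the summary image by its <img> tag, and saves
# both under info/, naming each file by its report date.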
def get_info(link, y, m, d):  # fetch one daily report page and store its PDF and image
    try:
        response = rq.get(link)
    except Exception:
        print("Unknown error while fetching the page for " + str(y) + '-' + str(m) + '-' + str(d) + ". Skipping this file and continuing.")
        return
    try:
        soup = BeautifulSoup(response.text, "html5lib", from_encoding="UTF-8")  # the page decodes garbled otherwise, so force UTF-8
        tag = soup.find("div", class_="m-grid__col1")  # locate the main content column
        str_pdf = tag.find("a", string=re.compile("国内における都道府県別のPCR検査陽性者数.*"))  # find the PDF link by its Japanese title
        str_image = tag.find('img', src=True)['src']  # take the src of the first image
    except Exception:
        print("Unknown error while extracting links for " + str(y) + '-' + str(m) + '-' + str(d) + ". Skipping this file and continuing.")
        return
    if str_image.find('data:image/png') == -1:
        img_link = 'https://www.mhlw.go.jp' + str_image  # the image is a normal relative URL
        try:
            urldownload(img_link, workdir + '\\info\\image\\' + str(y) + '\\' + str(m) + '\\' + str(y) + '年' + str(m) + '月' + str(d) + '日.png')
        except Exception:
            print("Unknown error while downloading the image for " + str(y) + '-' + str(m) + '-' + str(d) + ". Skipping this file and continuing.")
            print("Link found here: " + img_link)
    else:
        try:
            # the image is embedded as a base64 data URI, so decode it directly
            imgdata = base64.b64decode(str_image.strip().replace("data:image/png;base64,", ''))
            with open(workdir + '\\info\\image\\' + str(y) + '\\' + str(m) + '\\' + str(y) + '年' + str(m) + '月' + str(d) + '日.png', 'wb') as file:
                file.write(imgdata)
        except Exception:
            print("Unknown error while saving the image for " + str(y) + '-' + str(m) + '-' + str(d) + ". Skipping this file and continuing.")
    if str_pdf is None:
        print("No matching PDF link found for " + str(y) + '-' + str(m) + '-' + str(d) + ".")
        return
    pdf_link = 'https://www.mhlw.go.jp' + str_pdf['href']  # build the absolute PDF URL from the href attribute
    try:
        urldownload(pdf_link, workdir + '\\info\\pdf\\' + str(y) + '\\' + str(m) + '\\' + str(y) + '年' + str(m) + '月' + str(d) + '日.pdf')
    except Exception:
        print("Unknown error while downloading the PDF for " + str(y) + '-' + str(m) + '-' + str(d) + ". Skipping this file and continuing.")
        print("Link found here: " + pdf_link)
def urldownload(url, filename):  # download a file to the given path
    down_res = rq.get(url)
    with open(filename, 'wb') as file:
        file.write(down_res.content)
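# check_folder builds the output directory tree up front: dlinks/ for the
# per-month link lists, plus info/pdf/<year>/<month> and info/image/<year>/<month>
# for every month from June 2020 through the current year.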
def check_folder():
    # dlinks
    dir_dlinks = workdir + '\\dlinks'
    if not os.path.exists(dir_dlinks):
        os.makedirs(dir_dlinks)
    # info, info/image, info/pdf
    for sub in ('\\info', '\\info\\image', '\\info\\pdf'):
        if not os.path.exists(workdir + sub):
            os.makedirs(workdir + sub)
    # info/pdf/<year>/<month> and info/image/<year>/<month>, back to 2020
    for i_year in range(year, 2019, -1):
        for kind in ('pdf', 'image'):
            dir_year = workdir + '\\info\\' + kind + '\\' + str(i_year)
            if not os.path.exists(dir_year):
                os.makedirs(dir_year)
            for i_month in range(1, 13):
                if i_year == 2020 and i_month < 6:
                    continue  # skip months before June 2020
                dir_month = dir_year + '\\' + str(i_month)
                if not os.path.exists(dir_month):
                    os.makedirs(dir_month)
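# i_to_date maps the month-page index i (0 = the newest month) to a calendar
# year and month, then walks that month's dlinks file from its last line (the
# oldest report) to its first, assigning consecutive dates and passing each
# valid link to get_info.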
def i_to_date(i):
    if i < month:
        # index 0 is the newest month page, so count backwards from the current month
        this_year = year
        this_month = month - i
    else:
        this_year = year - ((i - month) // 12 + 1)
        this_month = 12 - ((i - month) % 12)
    file_name = workdir + '\\dlinks\\dlinks' + str(i) + '.txt'
    with open(file_name) as f:
        file_line = len(f.readlines())
    this_day = 0
    url_old = ""
    sp = False
    while file_line > 0:
        if sp:
            # a month boundary was crossed on the previous pass; move to day 1 of the next month
            if this_month == 12:
                this_month = 1
                this_year = this_year + 1
            else:
                this_month = this_month + 1
            this_day = 1
            sp = False
        if this_day == 0:
            # first pass: the month's oldest link is dated the last day of the previous month
            if this_month == 1 and this_year == 2020:
                this_day = 22  # the January 2020 reports begin on the 22nd
            elif this_month == 1:
                this_month = 12
                this_year = this_year - 1
                this_day = 31
                sp = True
            elif this_month in (2, 4, 6, 8, 9, 11):  # previous month has 31 days
                this_month = this_month - 1
                this_day = 31
                sp = True
            elif this_month == 3:  # previous month is February; check for a leap year
                this_month = this_month - 1
                sp = True
                if (this_year % 4 == 0 and this_year % 100 != 0) or this_year % 400 == 0:
                    this_day = 29
                else:
                    this_day = 28
            else:  # previous month has 30 days
                this_month = this_month - 1
                this_day = 30
                sp = True
        url = linecache.getline(file_name, file_line).strip()
        if url.find('https://') == -1 and url.find('/stf/') >= 0:
            url = 'https://www.mhlw.go.jp' + url  # make relative /stf/ links absolute
        elif url.find('https://') == -1:
            print('Malformed record found; skipped.')
            file_line = file_line - 1
            continue
        elif url_old == url:
            print('Duplicate record found; skipped.')
            file_line = file_line - 1
            continue
        file_line = file_line - 1
        get_info(url, this_year, this_month, this_day)
        url_old = url
        if not sp:
            this_day = this_day + 1
if __name__ == '__main__':
    check_folder()
    get_m_links()
    m_number = len(all_m_link)
    for i in range(m_number):
        get_d_links(all_m_link[i], i)
    for i in range(m_number - 5):  # adjust this range to resume a partial download
        i_to_date(i)
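A note on dependencies: the script needs requests, beautifulsoup4, and html5lib installed before it will run, and the Windows-style '\\' path separators suggest it was written for Windows.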
Nice!
>_<