Scraping Data from Japan's COVID-19 Website

This post records my solution to a written test set by a lab at my school that I applied to join. The script scrapes Japan's Ministry of Health, Labour and Welfare (MHLW) COVID-19 pages in three passes: it first collects the monthly index links, then the daily report links for each month, and finally downloads each day's prefecture-level PCR-positive-count PDF and summary image.

import requests as rq
from bs4 import BeautifulSoup  # html5lib must be installed as the parser backend, but needs no import
import os
import datetime
import re
import linecache
import base64


all_m_link = []  # holds every monthly index link
m_number: int
today = datetime.datetime.today()
year = today.year
month = today.month
workdir = os.getcwd()


def get_m_links():  # Collect the monthly index links
    url = "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/0000121431_00086.html"
    response = rq.get(url)
    soup = BeautifulSoup(response.text, "html5lib")
    tag = soup.find("div", id="h2_free1")  # locate the anchor element
    # Hop two siblings down (the first hop is a whitespace text node) and collect every href
    for m_links in tag.next_sibling.next_sibling.find_all('a', href=True):
        all_m_link.append(m_links['href'])

    # Done collecting monthly links
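
An aside on the double next_sibling hop: in BeautifulSoup the whitespace between two tags counts as a text-node sibling, so the first hop lands on a blank string and the second reaches the actual list element. A less fragile alternative is find_next_sibling(), which skips text nodes. A sketch of how the loop inside get_m_links could read, assuming the month links sit in a <ul> right after the anchor div (I have not re-checked the page's exact markup):

# find_next_sibling() returns the next *element* sibling, so it does not
# break if the whitespace between tags changes.
tag = soup.find("div", id="h2_free1")
month_list = tag.find_next_sibling("ul")  # assumption: the links live in a <ul>
for a in month_list.find_all('a', href=True):
    all_m_link.append(a['href'])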


def get_d_links(link, m):  # For one month, collect every daily link and save them to a file
    response = rq.get(link)
    soup = BeautifulSoup(response.text, "html5lib")
    tag = soup.find("div", id="h2_free1")  # locate the anchor element
    with open("dlinks/dlinks" + str(m) + ".txt", "w+") as file:
        # Hop two siblings down (the first hop is a whitespace text node) and collect every href
        for d_links in tag.next_sibling.next_sibling.find_all('a', href=True):
            d_str_links = d_links['href']
            if d_str_links == "":
                file.write("not found\n")
            elif d_str_links.find('http://') != -1:
                # str.replace returns a new string, so the result must be reassigned
                d_str_links = d_str_links.strip().replace("http://", 'https://')
                file.write(d_str_links + "\n")
            else:
                file.write(d_str_links + "\n")


# Done collecting daily links
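
A side note on link normalization: instead of hand-prefixing the site root and string-replacing "http://", the standard library can do the resolution. A minimal sketch (normalize is a hypothetical helper, not used by the script itself):

from urllib.parse import urljoin

def normalize(href, base="https://www.mhlw.go.jp"):
    # urljoin resolves relative paths like /stf/... against base and
    # leaves absolute URLs untouched; replace() then upgrades http://.
    return urljoin(base, href).replace("http://", "https://", 1)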


def get_info(link, y, m, d):  # For one day, pull out the PDF and the image and save them
    try:
        response = rq.get(link)
    except Exception:
        print("Unknown error while fetching the page for " + str(y) + '-' + str(m) + '-' + str(d) + ". Skipping and continuing.")
        return
    try:
        # Some pages decode as mojibake, so parse the raw bytes and force UTF-8
        soup = BeautifulSoup(response.content, "html5lib", from_encoding="UTF-8")
        tag = soup.find("div", class_="m-grid__col1")
        # Locate the PDF link by its label text, and take the first inline image's src
        str_pdf = tag.find_all("a", string=re.compile("国内における都道府県別のPCR検査陽性者数.*"), limit=1)
        str_image = tag.find('img', src=True)['src']
    except Exception:
        print('Unknown error while extracting links for ' + str(y) + '-' + str(m) + '-' + str(d) + '. Skipping and continuing.')
        return
    if str_image.find('data:image/png') == -1:
        img_link = 'https://www.mhlw.go.jp' + str_image  # ordinary image URL
        try:
            urldownload(img_link, workdir + '\\info\\image\\' + str(y) + '\\' + str(m) + '\\' + str(y) + '年' + str(m) + '月' + str(d) + '日.png')
        except Exception:
            print("Unknown error while downloading the image for " + str(y) + '-' + str(m) + '-' + str(d) + ". Skipping and continuing.")
            print("The link found here was: " + img_link)
    else:
        # The image is inlined as a base64 data URI: strip the header and decode the bytes
        try:
            imgdata = base64.b64decode(str_image.strip().replace("data:image/png;base64,", ''))
            with open(workdir + '\\info\\image\\' + str(y) + '\\' + str(m) + '\\' + str(y) + '年' + str(m) + '月' + str(d) + '日.png', 'wb') as file:
                file.write(imgdata)
        except Exception:
            print("Unknown error while decoding the image for " + str(y) + '-' + str(m) + '-' + str(d) + ". Skipping and continuing.")
    if str_pdf:
        pdf_link = 'https://www.mhlw.go.jp' + str_pdf[0]['href']  # read the href directly instead of slicing the tag's repr
        try:
            urldownload(pdf_link, workdir + '\\info\\pdf\\' + str(y) + '\\' + str(m) + '\\' + str(y) + '年' + str(m) + '月' + str(d) + '日.pdf')
        except Exception:
            print("Unknown error while downloading the PDF for " + str(y) + '-' + str(m) + '-' + str(d) + ". Skipping and continuing.")
            print("The link found here was: " + pdf_link)


def urldownload(url, filename):  # Download helper: fetch a URL and write the body to disk
    down_res = rq.get(url)
    with open(filename, 'wb') as file:
        file.write(down_res.content)
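
urldownload buffers the whole response in memory and ignores HTTP errors. A slightly hardened variant (a sketch; urldownload_safe is hypothetical and not used by the script) would check the status code, set a timeout, and stream the body to disk in chunks:

def urldownload_safe(url, filename, timeout=30):
    # raise_for_status() turns 4xx/5xx responses into exceptions, and
    # stream=True avoids holding a whole PDF in memory at once.
    with rq.get(url, stream=True, timeout=timeout) as down_res:
        down_res.raise_for_status()
        with open(filename, 'wb') as file:
            for chunk in down_res.iter_content(chunk_size=8192):
                file.write(chunk)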


def check_folder():  # Create the whole output directory tree up front
    os.makedirs(workdir + '\\dlinks', exist_ok=True)
    # info/image/<year>/<month> and info/pdf/<year>/<month>, for every year back to 2020;
    # makedirs creates the intermediate info\image and info\pdf levels automatically
    for kind in ('image', 'pdf'):
        for i_year in range(year, 2019, -1):
            for i_month in range(1, 13):
                if i_year == 2020 and i_month < 6:  # skip months before June 2020
                    continue
                os.makedirs(workdir + '\\info\\' + kind + '\\' + str(i_year) + '\\' + str(i_month), exist_ok=True)


def i_to_date(i):  # Map month-index i back to a year/month, then date every line of its link file
    if i < month:
        this_year = year
        this_month = month - i  # index 0 is the current month, counting backwards
    elif i < month + 12:
        this_year = year - 1
        this_month = 12 - ((i - month) % 12)
    elif i < month + 24:
        this_year = year - 2
        this_month = 12 - ((i - month) % 12)
    else:
        this_year = year - 3
        this_month = 12 - ((i - month) % 12)
    file_name = workdir + '\\dlinks\\dlinks' + str(i) + '.txt'
    with open(file_name) as f:
        file_line = len(f.readlines())
    this_day = 0
    url_old = ""
    sp = False
    while file_line > 0:  # walk the file from its last line (oldest link) up to its first
        if sp:  # the previous iteration handled the spill-over day from the month before
            if this_month == 12:
                this_month = 1
                this_year = this_year + 1
            else:
                this_month = this_month + 1
            this_day = 1
            sp = False
        if this_day == 0:  # first processed line: it belongs to the previous month's last day
            if this_month == 1 and this_year == 2020:
                this_day = 22  # the earliest records start on January 22
            elif this_month == 1:
                this_month = 12
                this_year = this_year - 1
                this_day = 31
                sp = True
            elif this_month in (2, 4, 6, 8, 9, 11):  # previous month has 31 days
                this_month = this_month - 1
                this_day = 31
                sp = True
            elif this_month == 3:  # previous month is February: check for a leap year
                this_month = this_month - 1
                sp = True
                if (this_year % 4 == 0 and this_year % 100 != 0) or this_year % 400 == 0:
                    this_day = 29
                else:
                    this_day = 28
            else:  # previous month has 30 days
                this_month = this_month - 1
                this_day = 30
                sp = True
        url = linecache.getline(file_name, file_line).strip()
        if url.find('https://') == -1 and url.find('/stf/') >= 0:
            url = 'https://www.mhlw.go.jp' + url  # relative link: prepend the site root
        elif url.find('https://') == -1:
            print('Malformed record found, skipping it')
            file_line = file_line - 1
            continue
        if url == url_old:  # check duplicates after normalization, so prefixed links are covered too
            print('Duplicate record found, skipping it')
            file_line = file_line - 1
            continue
        file_line = file_line - 1
        get_info(url, this_year, this_month, this_day)
        url_old = url
        if not sp:
            this_day = this_day + 1

if __name__ == '__main__':
    check_folder()
    get_m_links()
    m_number = len(all_m_link)
    for i in range(m_number):
        get_d_links(all_m_link[i], i)
    for i in range(0, m_number - 5):  # change the 0 here to resume an interrupted download
        i_to_date(i)
