8.1 空氣品質監測網

這個網站可以讓我們依照所選擇的地區查看空氣品質數據。不過基於安全上的考量，它對 POST 請求有一些特殊的限制：除了與查詢有關的表單資料外，還必須另外送出 __VIEWSTATE、__EVENTVALIDATION 以及 __VIEWSTATEGENERATOR 這三個值。這些額外的值都可以從對空氣品質監測網發出 GET 請求後所得到的 response 中取得。

import requests
from bs4 import BeautifulSoup


# Hourly-data query page of the air quality monitoring site. The same URL is
# used both for the initial GET (to read the hidden form fields) and for the
# query POST.
EPA_TAQM_URL = 'http://taqm.epa.gov.tw/taqm/tw/HourlyData.aspx'


def generate_query_form_data(start_date, end_date):
    """Build the form data needed to POST an hourly-data query.

    The query page is fetched with GET first, because the POST must echo back
    the hidden ``__VIEWSTATE``, ``__EVENTVALIDATION`` and
    ``__VIEWSTATEGENERATOR`` values found in that response.

    Args:
        start_date: Start date of the query as a string, e.g. '2017/05/20'.
        end_date: End date of the query as a string, e.g. '2017/05/22'.

    Returns:
        A dict of form fields ready to be passed as ``data=`` to
        ``requests.post``.

    Raises:
        requests.HTTPError: If the GET request returns an error status.
        ValueError: If one of the required hidden fields is missing from
            the page.
    """
    import re  # local import: only needed to locate the control prefix

    # Without a timeout, requests waits forever on an unresponsive server.
    resp = requests.get(EPA_TAQM_URL, timeout=30)
    resp.raise_for_status()
    dom = BeautifulSoup(resp.text, 'html5lib')

    form_data = {}
    for field_id in ('__VIEWSTATE', '__EVENTVALIDATION',
                     '__VIEWSTATEGENERATOR'):
        field = dom.find(id=field_id)
        if field is None or not field.has_attr('value'):
            raise ValueError(
                'hidden field %r not found in %s' % (field_id, EPA_TAQM_URL))
        form_data[field_id] = field['value']

    # The "ctlxx" prefix of the query controls changes dynamically, so read it
    # from the name of the start-date control on the page we just fetched.
    # Fall back to the previously hard-coded prefix if it cannot be found.
    prefix = 'ctl09'
    date_control = dom.find(attrs={'name': re.compile(r'\$txtDateS$')})
    if date_control is not None:
        prefix = date_control['name'].rsplit('$', 1)[0]

    # NOTE(review): '56' and '4' are fixed selections for the site and
    # parameter list boxes - presumably a station id and a measurement id;
    # confirm against the option lists on the page.
    form_data.update({
        prefix + '$lbSite': '56',
        prefix + '$lbParam': '4',
        prefix + '$txtDateS': start_date,
        prefix + '$txtDateE': end_date,
        prefix + '$btnQuery': '查詢即時值',
    })
    return form_data


def get_web_content(start_date, end_date):
    """POST an hourly-data query and return the parsed result page.

    Args:
        start_date: Start date of the query as a string, e.g. '2017/05/20'.
        end_date: End date of the query as a string, e.g. '2017/05/22'.

    Returns:
        A ``BeautifulSoup`` object for the response page, or ``None`` when no
        form data could be generated.

    Raises:
        requests.HTTPError: If the POST request returns an error status.
    """
    form_data = generate_query_form_data(start_date, end_date)
    if not form_data:
        return None

    # Without a timeout, requests waits forever on an unresponsive server.
    resp = requests.post(EPA_TAQM_URL, data=form_data, timeout=30)
    # Fail loudly instead of parsing an error page as if it were results.
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html5lib')


def main():
    """Query a fixed date range and print the text of each result table."""
    dom = get_web_content('2017/05/20', '2017/05/22')
    if not dom:
        return
    # Each table with class "TABLE_G" is printed as a list of its
    # whitespace-stripped text fragments.
    for result_table in dom.find_all('table', class_='TABLE_G'):
        print(list(result_table.stripped_strings))


# Run the demo query only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

輸出結果:

原始碼點我

Last updated

Was this helpful?