8.1 空氣品質監測網

這個網站可以讓我們依照所選的地區查看空氣品質數據。不過基於安全上的考量，它對 POST 請求有一些特殊的限制：除了與查詢有關的表單資料外，還必須一併送出另外三個值：__VIEWSTATE、__EVENTVALIDATION 以及 __VIEWSTATEGENERATOR。這些額外的值都可以先對空氣品質監測網發出 GET 請求，再從回傳的 response 中取得。

import requests
from bs4 import BeautifulSoup


# Hourly-data query page of the air quality monitoring site; the same URL is
# used for the initial GET (to read the hidden form fields) and for the POST
# that submits the query.
EPA_TAQM_URL = 'http://taqm.epa.gov.tw/taqm/tw/HourlyData.aspx'


def _find_control_prefix(dom, default='ctl09'):
    """Return the prefix shared by the query form's control names.

    The form controls are named ``<prefix>$txtDateS``, ``<prefix>$lbSite``
    and so on, and the ``ctlXX`` prefix changes over time.  The prefix is
    read from whichever element's ``name`` ends with ``$txtDateS``; when no
    such element exists, ``default`` is returned.
    """
    suffix = '$txtDateS'
    field = dom.find(
        attrs={'name': lambda name: bool(name) and name.endswith(suffix)})
    if field is None:
        return default
    return field['name'][:-len(suffix)]


def generate_query_form_data(start_date, end_date):
    """Build the form data required to POST an hourly-data query.

    The page is fetched with GET first because the server only accepts a
    POST that echoes back the hidden ``__VIEWSTATE``, ``__EVENTVALIDATION``
    and ``__VIEWSTATEGENERATOR`` values found in that response.

    Args:
        start_date: First day of the query, as a string in the format the
            site expects (the example script passes ``'2017/05/20'``).
        end_date: Last day of the query, in the same format.

    Returns:
        A dict ready to be passed as ``data=`` to ``requests.post``, or
        ``None`` when any of the hidden fields is missing from the page
        (for example because the site returned an error page).
    """
    # Without a timeout an unresponsive server would block forever.
    resp = requests.get(EPA_TAQM_URL, timeout=30)
    dom = BeautifulSoup(resp.text, 'html5lib')

    form_data = {}
    for field_id in ('__VIEWSTATE', '__EVENTVALIDATION',
                     '__VIEWSTATEGENERATOR'):
        tag = dom.find(id=field_id)
        if tag is None or not tag.has_attr('value'):
            # The page does not look like the query form; let the caller
            # decide what to do instead of failing with a TypeError.
            return None
        form_data[field_id] = tag['value']

    # The xx in every ctlxx$[var_name] changes dynamically, so it is read
    # from the page and only falls back to the last known value.
    prefix = _find_control_prefix(dom)
    form_data.update({
        # NOTE(review): '56' and '4' are presumably the monitoring-site id
        # and the measured-parameter id -- confirm against the page's
        # select options.
        prefix + '$lbSite': '56',
        prefix + '$lbParam': '4',
        prefix + '$txtDateS': start_date,
        prefix + '$txtDateE': end_date,
        prefix + '$btnQuery': '查詢即時值',
    })
    return form_data


def get_web_content(start_date, end_date):
    """Submit the hourly-data query and return the parsed result page.

    Args:
        start_date: First day of the query, passed through unchanged to
            ``generate_query_form_data``.
        end_date: Last day of the query, passed through unchanged.

    Returns:
        A ``BeautifulSoup`` document built from the POST response, or
        ``None`` when no form data could be generated.
    """
    form_data = generate_query_form_data(start_date, end_date)
    if not form_data:
        return None
    # Without a timeout an unresponsive server would block forever.
    resp = requests.post(EPA_TAQM_URL, data=form_data, timeout=30)
    return BeautifulSoup(resp.text, 'html5lib')


def main():
    """Query a fixed three-day range and print each result table.

    Every ``table`` element with the CSS class ``TABLE_G`` in the returned
    page is printed as one list of its whitespace-stripped text fragments.
    Nothing is printed when no page could be retrieved.
    """
    dom = get_web_content('2017/05/20', '2017/05/22')
    if not dom:
        return
    for result_table in dom.find_all('table', class_='TABLE_G'):
        print(list(result_table.stripped_strings))


# Run the example query only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == '__main__':
    main()

輸出結果:

['日期', '00', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '05/20', '34', '32', '38', '43', '40', '41', '43', '53', '55', '59', '65', '51', '36', '15', '25', '28', '40', '43', '36', '33', '27', '35', '42', '47', '05/21', '51', '54', '57', '59', '60', '64', '71', '66', '68', '59', '54', '35', '35', '39', '62', '59', '53', '37', '36', '35', '31', '34', '32', '36', '05/22', '30', '39', '37', '42', '39', '35', '36', '38', '45', '41', '35', '30', '43', '43', '45', '28', '22', '24', '27', '33', '30', '30', '21', '17']

Process finished with exit code 0

原始碼點我

Previous: 8. 處理POST請求/登入頁面　Next: 9. 動態網頁爬蟲

Last updated 5 years ago

Was this helpful?