5.2 儲存成CSV檔案

這隻爬蟲會到 EZprice 網站上，根據指定的商品關鍵字搜集商品資訊，並將爬到的資訊儲存至 CSV 檔案裡。

import requests
import urllib.parse
import csv
import os
from bs4 import BeautifulSoup


# Base URL of the EZprice price-comparison site this scraper targets.
EZPRICE_URL = 'https://ezprice.com.tw'
# Output file that scraped rows are appended to (and read back from).
CSV_FILE_NAME = 'ezprice.csv'


def get_web_content(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up; the
            original had no timeout, so a stalled connection hung forever.

    Returns:
        The page HTML as a string, or None when the request errors out or
        the server does not answer with HTTP 200.
    """
    try:
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # A network failure on one page should not kill the whole crawl.
        print('Request failed: %s' % e)
        return None
    if resp.status_code != 200:
        print('Invalid url: ' + resp.url)
        return None
    return resp.text


def get_price_info(query, page):
    """Search EZprice for *query* and parse the first *page* result pages.

    Args:
        query: Product keyword (may contain non-ASCII characters; it is
            URL-encoded before being placed in the query string).
        page: Number of result pages to fetch, starting from page 1.

    Returns:
        A list of BeautifulSoup documents, one per successfully fetched page.
    """
    encoded_query = urllib.parse.quote(query)
    doms = []
    # Use a distinct loop variable: the original shadowed the 'page' parameter.
    for page_no in range(1, page + 1):
        url = '%s/s/%s/price/?q=%s&p=%d' % (EZPRICE_URL, encoded_query, encoded_query, page_no)
        result_page = get_web_content(url)
        # Skip pages that failed to download instead of crashing
        # BeautifulSoup with a None markup argument.
        if result_page is not None:
            doms.append(BeautifulSoup(result_page, 'html5lib'))
    return doms


def extract_results(dom):
    """Pull [title, price, store] triples out of one search-result page.

    Args:
        dom: BeautifulSoup document of an EZprice search-result page.

    Returns:
        A tuple ``(items, count)`` where *items* is a list of
        ``[title, price, store]`` lists ('N/A' when no store name is
        present) and *count* equals ``len(items)``.
    """
    collected = []
    for result in dom.find_all('div', 'search-rst clearfix'):
        title = result.h4.a['title']
        price = result.find(itemprop='price')['content']
        store_tag = result.find('span', 'platform-name')
        store = store_tag.text.strip() if store_tag else 'N/A'
        collected.append([title, price, store])
    return collected, len(collected)


def show_results(items):
    """Print each scraped item on its own line for quick inspection."""
    for entry in items:
        print(entry)


def write_to_csv_file(is_first_page, items, file_name=None):
    """Append *items* to a CSV file, writing the header row first when asked.

    Args:
        is_first_page: When True, write the 'Item,Price,Store' header
            before the data rows (pass True for the first page only, so
            the header appears exactly once).
        items: Iterable of [title, price, store] rows.
        file_name: Target CSV path; defaults to the module-level
            CSV_FILE_NAME when omitted (keeps the original call signature
            working unchanged).
    """
    if file_name is None:
        file_name = CSV_FILE_NAME
    with open(file_name, 'a', encoding='UTF-8', newline='') as file:
        writer = csv.writer(file)
        if is_first_page:
            writer.writerow(('Item', 'Price', 'Store'))
        # writerow accepts the row sequence directly; the original wrapped
        # each row in a pointless generator expression.
        writer.writerows(items)


def read_from_csv_file(file_name=None):
    """Print every Item/Price/Store row of a CSV file written by this script.

    Args:
        file_name: CSV path to read; defaults to the module-level
            CSV_FILE_NAME when omitted (keeps the original call signature
            working unchanged).
    """
    if file_name is None:
        file_name = CSV_FILE_NAME
    print('\nRead from csv file: ' + file_name)
    with open(file_name, 'r', encoding='UTF-8') as file:
        # DictReader uses the header row written by write_to_csv_file as keys.
        for row in csv.DictReader(file):
            print(row['Item'], row['Price'], row['Store'])


def main():
    """Crawl EZprice for a keyword, dump the results to CSV, then read them back."""
    query = '吉胖喵'
    page = 5
    is_first_page = True
    total_item_count = 0
    for dom in get_price_info(query, page):
        items, count = extract_results(dom)
        total_item_count += count
        show_results(items)
        write_to_csv_file(is_first_page, items)
        # Only the very first page gets the CSV header row.
        is_first_page = False
    print('There are %s items in %d page(s).' % (total_item_count, page))
    read_from_csv_file()
    # Uncomment this if you don't want to keep the data in csv file.
    # os.remove(CSV_FILE_NAME)


if __name__ == '__main__':
    main()

輸出結果:

CSV檔案內容:

原始碼點我

Last updated

Was this helpful?