# Source code for finnish_media_scrapers.scripts.fetch_hs

#!/usr/bin/env python3
"""Command-line script for fetching article HTML from Helsingin Sanomat
"""
# %%

import argparse
import asyncio
import csv
import logging
import os
import random
from time import sleep

from pyppeteer import launch

from ..fetch import fetch_article_hs, prepare_session_hs

# Configure the root logger at import time so INFO-level progress messages are shown by default.
logging.basicConfig(level=logging.INFO)
# %%


def _parse_arguments():
    """Define the script's command-line options and parse ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` produced by the parser, carrying ``input``,
        ``output``, ``username``, ``password`` (all required strings),
        ``max_web_driver_wait`` (int, default 30), ``delay`` (float,
        default 1.0) and ``quiet`` (bool, default False).
    """
    cli = argparse.ArgumentParser()

    # The four mandatory string options share the same shape, so register
    # them from a table (order is kept so the generated --help is unchanged).
    mandatory_options = (
        ('-i', '--input',
         "input CSV file containing articles to fetch (from query-hs.py)"),
        ('-o', '--output', "directory to fetch articles into"),
        ('-u', '--username', "email to use for article fetching"),
        ('-p', '--password', "password to use for article fetching"),
    )
    for short_flag, long_flag, description in mandatory_options:
        cli.add_argument(short_flag, long_flag,
                         help=description, required=True)

    # Optional tuning knobs with typed defaults.
    cli.add_argument(
        '-mw', '--max-web-driver-wait',
        type=int,
        default=30,
        help="maximum time in seconds to wait for the webdriver to render a page before failing (default 30)",
    )
    cli.add_argument(
        '-d', '--delay',
        type=float,
        default=1.0,
        help="number of seconds to wait between consecutive requests (default 1.0)",
    )
    cli.add_argument(
        '--quiet',
        action='store_true',
        default=False,
        help="Log only errors",
    )

    return cli.parse_args()


async def _amain():
    """Fetch every article listed in the input CSV and store it as an HTML file.

    Parses the command line, launches a headless browser, prepares the
    session via ``prepare_session_hs`` with the given credentials, and then
    walks the rows of the ``--input`` CSV. For each row the ``url`` column is
    fetched with ``fetch_article_hs`` and written to ``<output>/<id>.html``
    unless that file already exists. A failed fetch is logged as an error and
    the row is skipped. The page and the browser are always closed on exit.

    Raises:
        KeyError: if a CSV row lacks the ``url`` or ``id`` column.
        OSError: if the input file cannot be read or an output file cannot be
            written.
    """
    args = _parse_arguments()
    if args.quiet:
        # logging.basicConfig() is a no-op once the root logger has a handler
        # (it was already configured at import time), so change the level on
        # the root logger directly.
        logging.getLogger().setLevel(logging.ERROR)
    browser = await launch()
    try:
        session = await browser.newPage()
        try:
            logging.info("Logging into HS.")
            await prepare_session_hs(session, args.username,
                                     args.password, args.max_web_driver_wait)
            logging.info("Logged in.")
            os.makedirs(args.output, exist_ok=True)
            with open(args.input, encoding="utf-8") as input_file:
                for row in csv.DictReader(input_file):
                    url = row['url']
                    file = os.path.join(args.output, str(row['id']) + ".html")
                    if os.path.exists(file):
                        logging.info(
                            "Skipping %s as %s already exists.", url, file)
                        continue
                    try:
                        article_html = await fetch_article_hs(
                            session, url, args.max_web_driver_wait)
                    except Exception as exc:
                        # Best effort: one broken article must not abort the
                        # whole run, but it has to show up even with --quiet.
                        logging.error("Failed to fetch %s: %s", url, exc)
                        continue
                    with open(file, "w", encoding="utf-8") as article_file:
                        article_file.write(
                            "<!DOCTYPE html><head><meta charset='utf-8'></head>" + article_html + "</html>")
                    logging.info("Wrote article %s into %s", url, file)
                    # Randomised pause averaging args.delay seconds.
                    # random.randrange() rejects float arguments (and 0), and
                    # time.sleep() would block the event loop, so draw a float
                    # and yield to the loop instead.
                    pause = random.uniform(0.0, max(0.0, args.delay * 2))
                    await asyncio.sleep(pause)
        finally:
            await session.close()
    finally:
        await browser.close()


def main():
    """Entry point: run the asynchronous fetch routine to completion."""
    asyncio.run(_amain())


# Run the fetcher only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()