Source code for finnish_media_scrapers.scripts.fetch_open

#!/usr/bin/env python3
"""Command-line script for fetching article HTML from openly available sources
"""

import argparse
import asyncio
import csv
import logging
import os
import random
from time import sleep

import aiohttp

logging.basicConfig(level=logging.INFO)


def _parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-i', '--input', help="input CSV file containing articles to fetch (from query-il, query-is or query-yle)", required=True)
    parser.add_argument(
        '-o', '--output', help="directory to fetch articles into", required=True)
    parser.add_argument(
        '-d', '--delay', help="number of seconds to wait between consecutive requests", default=1.0, type=float)
    parser.add_argument('--quiet', default=False,
                        action='store_true', help="Log only errors")
    return parser.parse_args()


async def _amain():
    args = _parse_arguments()
    if args.quiet:
        logging.basicConfig(level=logging.ERROR)
    os.makedirs(args.output, exist_ok=True)
    with open(args.input, encoding="utf-8") as input_file:
        async with aiohttp.ClientSession() as session:
            csv_input = csv.DictReader(input_file)
            for article in csv_input:
                article_file_name = os.path.join(args.output, article['id']+".html")
                if not os.path.exists(article_file_name):
                    async with session.get(article['url']) as response:
                        content = await response.text()
                        with open(article_file_name, "w", encoding="utf-8") as article_file:
                            article_file.write(content)
                    logging.info("wrote %s into %s", article['url'], article_file_name)
                    sleep(random.randrange(args.delay*2))


[docs]def main(): asyncio.run(_amain())
if __name__ == '__main__': main()