# Source code for finnish_media_scrapers.scripts.query_il

#!/usr/bin/env python3
"""Command-line script for querying Iltalehti
"""

import argparse
import asyncio
import csv
import logging
import random
from datetime import datetime
from time import sleep

import aiohttp

from ..query import query_il

# Configure the root logger when this module is loaded so that INFO-level
# progress messages are emitted by default.
logging.basicConfig(level=logging.INFO)


def _parse_arguments():
    """Define the command-line interface and parse the process arguments.

    Returns:
        argparse.Namespace: parsed options with the attributes ``from_date``,
        ``to_date``, ``query``, ``output``, ``limit``, ``delay`` and ``quiet``.
    """
    today = datetime.today().strftime('%Y-%m-%d')

    # (flags, add_argument keyword options), listed in the order in which
    # they are registered and therefore shown in the --help output.
    option_specs = (
        (('-f', '--from-date'),
         {'help': "from date (inclusive, YYYY-MM-DD)", 'required': True}),
        (('-t', '--to-date'),
         {'help': "to date (inclusive, YYYY-MM-DD, defaults to today)",
          'default': today}),
        (('-q', '--query'),
         {'help': "query string to search for", 'required': True}),
        (('-o', '--output'),
         {'help': "output CSV file", 'required': True}),
        (('-l', '--limit'),
         {'help': "number of articles to fetch per query (max==200)",
          'default': 200, 'type': int}),
        (('-d', '--delay'),
         {'help': "number of seconds to wait between consecutive requests",
          'default': 1.0, 'type': float}),
        (('--quiet',),
         {'default': False, 'action': 'store_true',
          'help': "Log only errors"}),
    )

    cli = argparse.ArgumentParser()
    for flags, options in option_specs:
        cli.add_argument(*flags, **options)
    return cli.parse_args()


async def _amain():
    """Run the query given on the command line and write the results to CSV.

    Parses the command-line arguments, iterates over the responses yielded by
    ``query_il`` and writes one row (``id``, ``url``, ``title``,
    ``date_modified``) per article to the ``--output`` file. After each
    response the coroutine pauses for a random interval between 0 and
    ``2 * --delay`` seconds.
    """
    args = _parse_arguments()

    if args.quiet:
        # The root logger is already configured when this module is loaded,
        # so another basicConfig() call would be silently ignored; change
        # the level on the root logger directly instead.
        logging.getLogger().setLevel(logging.ERROR)

    # newline="" is what the csv module expects of files it writes to, and an
    # explicit encoding keeps the output independent of the platform locale.
    with open(args.output, "w", newline="", encoding="utf-8") as output_file:
        csv_output = csv.writer(output_file)
        csv_output.writerow(['id', 'url', 'title', 'date_modified'])
        total_count = 0
        async with aiohttp.ClientSession() as session:
            async for response in query_il(session, args.query, args.from_date, args.to_date, args.limit):
                total_count += len(response.articles)
                logging.info(
                    "Processing %d articles from %s. In total fetched %d articles.",
                    len(response.articles), response.url, total_count)
                csv_output.writerows(
                    [article.id, article.url, article.title, article.date_modified]
                    for article in response.articles)
                # --delay is a float, which random.randrange() does not accept
                # on current Python versions (and it raises on 0), so draw a
                # float with uniform(). asyncio.sleep() is used so the pause
                # does not block the event loop.
                await asyncio.sleep(random.uniform(0, args.delay * 2))
        logging.info("Processed %s articles in total.", total_count)


def main():
    """Script entry point: run the asynchronous workflow to completion."""
    asyncio.run(_amain())


if __name__ == '__main__':
    main()