"""Functions for querying articles from the APIs of YLE, Helsingin Sanomat (HS), Ilta-Sanomat (IS) and Iltalehti (IL).
"""
from datetime import datetime, timedelta
from typing import AsyncIterable
from urllib.parse import quote

import attr
from aiohttp import ClientSession
@attr.s(auto_attribs=True)
class Article:
    """A single article as reported by one of the search APIs

    Attributes:
        id (str): the unique id for the article
        url (str): the url from which the article may be found
        title (str): the title or headline of the article
        date_modified (str): the date of last modification for the article
    """

    # Field order matters: it defines the positional order of the generated __init__.
    id: str
    url: str
    title: str
    date_modified: str
@attr.s(auto_attribs=True)
class Result:
    """The outcome of one API call

    Attributes:
        articles (list[Article]): a list of the article objects returned
        url (str): the URL of the API query
        total (int): the total number of articles for the query. -1 if not available.
    """

    # Field order matters: the query functions construct Result positionally.
    articles: 'list[Article]'
    url: str
    total: int = -1
# Search endpoint of the YLE API; query_yle sends its GET requests here.
yle_api: str = "https://yle-fi-search.api.yle.fi/v1/search"
async def query_yle(session: ClientSession, query: str, language: str, from_date: str, to_date: str, batch_size: int = 10000) -> AsyncIterable[Result]:
    """Query the YLE API for articles matching a query

    Results are fetched page by page. The first page is also used to check the
    total hit count, so no separate probing request is made.

    Args:
        session (ClientSession): the aiohttp session to use
        query (str): the query string to search for
        language (str): language to search (either 'fi' or 'sv')
        from_date (str): date to search from (inclusive, YYYY-MM-DD)
        to_date (str): date to search to (inclusive, YYYY-MM-DD)
        batch_size (int, optional): How many entries to query for in a single API call. Maximum and default for the YLE API is 10000.

    Raises:
        ValueError: when the API answers with a non-200 status or an empty body,
            or when the query matches more than 10000 articles

    Yields:
        AsyncIterable[Result]: each Result contains the results from a single API call
    """
    params = {
        'app_id': 'hakuylefi_v2_prod',
        'app_key': '4c1422b466ee676e03c4ba9866c0921f',
        'service': 'uutiset',
        'language': language,
        'uiLanguage': language,
        'type': 'article',
        'time': 'custom',
        'timeFrom': from_date,
        'timeTo': to_date,
        'query': query,
        'offset': 0,
        'limit': batch_size,
    }
    while True:
        async with session.get(yle_api, params=params) as response:
            if response.status != 200:
                raise ValueError(
                    f"Got unexpected response code {response.status} for {response.url}.")
            response_json = await response.json()
            if response_json is None:
                raise ValueError(f"Got empty response for {response.url}")
            request_url = str(response.url)
        # The body has been read completely, so the response can be released
        # before control is handed to the consumer.
        total = response_json['meta']['count']
        # Only the first page decides whether the query is too broad; this
        # happens before anything is yielded.
        if params['offset'] == 0 and total > 10000:
            raise ValueError(
                f"Query results in {total} results. The YLE API refuses to return more than 10000 results, so refusing to continue. You can work around this limitation by doing multiple queries on smaller timespans.")
        if len(response_json['data']) == 0:  # Got 0 results, assuming we're done.
            break
        articles = [
            Article(id=a['id'], url=a['url']['full'], title=a['headline'],
                    date_modified=a['datePublished'])
            for a in response_json['data']
        ]
        yield Result(articles, request_url, total)
        params['offset'] += batch_size
        # Once the next offset reaches the reported total there is nothing left
        # to fetch; '>=' avoids one extra request that would return no data.
        if params['offset'] >= total:
            break
# Base URL of the IS search API; query_is appends the query, paging and date-range path segments to it.
is_api: str = "https://www.is.fi/api/search"
async def query_is(session: ClientSession, query: str, from_date: str, to_date: str, batch_size: int = 100) -> AsyncIterable[Result]:
    """Query the IS API for articles matching a query

    Before paging, a probe request at offset 9950 checks that the query does
    not match more results than this function is prepared to fetch.

    Args:
        session (ClientSession): the aiohttp session to use
        query (str): the query string to search for
        from_date (str): date to search from (inclusive, YYYY-MM-DD)
        to_date (str): date to search to (inclusive, YYYY-MM-DD)
        batch_size (int, optional): How many entries to query for in a single API call. Values supported by the IS API are 50 and 100 (which is the default).

    Raises:
        ValueError: when the API answers with a non-200 status or an empty body,
            or when the probe request shows more than 9950 results

    Yields:
        AsyncIterable[Result]: each Result contains the results from a single API call
    """
    def _build_is_url(query: str, offset: int, limit: int, date_start: int, date_end: int) -> str:
        # The query is a single path segment, so it must be percent-encoded;
        # otherwise characters such as '/', '?' or '#' would change the URL structure.
        return f"{is_api}/{quote(query, safe='')}/kaikki/custom/new/{offset}/{limit}/{date_start}/{date_end}"

    # The URL carries the date range as epoch milliseconds. The dates are naive,
    # so datetime.timestamp() interprets them in the local timezone.
    date_start = int(datetime.fromisoformat(from_date).timestamp() * 1000)
    # One day is added so that the whole of to_date falls inside the range.
    date_end = int(
        (datetime.fromisoformat(to_date) + timedelta(days=1)).timestamp() * 1000)

    # Probe: any hit at offset 9950 means the query is too broad.
    async with session.get(_build_is_url(
            query, 9950, 50, date_start, date_end)) as response:
        if response.status != 200:
            raise ValueError(
                f"Got unexpected response code {response.status} for {response.url}.")
        response_json = await response.json()
        if response_json is None:
            # Same handling as in the paging loop, instead of a TypeError from len(None).
            raise ValueError(f"Got empty response for {response.url}")
        if len(response_json) != 0:
            raise ValueError("Query results in more than 9950 results. The IS API refuses to return more than 10000 results, so refusing to continue. You can work around this limitation by doing multiple queries on smaller timespans.")

    offset = 0
    while True:
        async with session.get(_build_is_url(query, offset, batch_size, date_start, date_end)) as response:
            if response.status != 200:
                raise ValueError(
                    f"Got unexpected response code {response.status} for {response.url}.")
            response_json = await response.json()
            if response_json is None:
                raise ValueError(f"Got empty response for {response.url}")
            if len(response_json) == 0:  # Got 0 results, assuming we're done.
                break
            articles = [
                Article(id=a['id'], url='https://www.is.fi' + a['href'],
                        title=a['title'], date_modified=a['displayDate'])
                for a in response_json
            ]
            # No total is passed, so Result.total is always -1 here.
            yield Result(articles, str(response.url), -1)
            offset += batch_size
# Search endpoint of the IL API; query_il sends its GET requests here.
il_api: str = "https://api.il.fi/v1/articles/search"
async def query_il(session: ClientSession, query: str, from_date: str, to_date: str, batch_size: int = 200) -> AsyncIterable[Result]:
    """Query the IL API for articles matching a query

    Pages are requested with an increasing offset until the API returns an
    empty list of articles.

    Args:
        session (ClientSession): the aiohttp session to use
        query (str): the query string to search for
        from_date (str): date to search from (inclusive, YYYY-MM-DD)
        to_date (str): date to search to (inclusive, YYYY-MM-DD)
        batch_size (int, optional): How many entries to query for in a single API call. Maximum and default for the IL API is 200.

    Raises:
        ValueError: when the API answers with a non-200 status or an empty body

    Yields:
        AsyncIterable[Result]: each Result contains the results from a single API call
    """
    params = {
        'date_start': from_date,
        'date_end': to_date,
        'q': query,
        'offset': 0,
        'limit': batch_size,
    }
    while True:
        async with session.get(il_api, params=params) as response:
            if response.status != 200:
                raise ValueError(
                    f"Got unexpected response code {response.status} for {response.url}.")
            payload = await response.json()
            # Check the payload before subscripting it, so that an empty body
            # raises the documented ValueError rather than a TypeError.
            if payload is None:
                raise ValueError(f"Got empty response for {response.url}")
            response_json = payload['response']
            if response_json is None:
                raise ValueError(f"Got empty response for {response.url}")
            if len(response_json) == 0:  # Got 0 results, assuming we're done.
                break
            articles = [
                Article(
                    id=a['article_id'],
                    url=f"http://iltalehti.fi/{a['category']['category_name']}/a/{a['article_id']}",
                    title=a['title'],
                    # Use the publication time when the article has no update time.
                    date_modified=a['updated_at'] if a['updated_at'] is not None else a['published_at'],
                )
                for a in response_json
            ]
            # No total is passed, so Result.total is always -1 here.
            yield Result(articles, str(response.url), -1)
            params['offset'] += batch_size
# Base URL of the HS search API; query_hs appends the query, paging and date-range path segments to it.
hs_api: str = "https://www.hs.fi/api/search"
async def query_hs(session: ClientSession, query: str, from_date: str, to_date: str, batch_size: int = 100) -> AsyncIterable[Result]:
    """Query the HS API for articles matching a query

    Before paging, a probe request at offset 9950 checks that the query does
    not match more results than this function is prepared to fetch.

    Args:
        session (ClientSession): the aiohttp session to use
        query (str): the query string to search for
        from_date (str): date to search from (inclusive, YYYY-MM-DD)
        to_date (str): date to search to (inclusive, YYYY-MM-DD)
        batch_size (int, optional): How many entries to query for in a single API call. Values supported by the HS API are 50 and 100 (which is the default).

    Raises:
        ValueError: when the API answers with a non-200 status or an empty body,
            or when the probe request shows more than 9950 results

    Yields:
        AsyncIterable[Result]: each Result contains the results from a single API call
    """
    def _build_hs_url(query: str, offset: int, limit: int, date_start: int, date_end: int) -> str:
        # The query is a single path segment, so it must be percent-encoded;
        # otherwise characters such as '/', '?' or '#' would change the URL structure.
        return f"{hs_api}/{quote(query, safe='')}/kaikki/custom/new/{offset}/{limit}/{date_start}/{date_end}"

    def _build_article_url(href: str) -> str:
        # A site-relative path always needs the host prefix, even when its text
        # happens to contain "http" or "www".
        if href.startswith('/') and not href.startswith('//'):
            return 'https://www.hs.fi' + href
        # Anything else that looks like it already names a host is passed through.
        if "http" in href or "www" in href:
            return href
        return 'https://www.hs.fi' + href

    # The URL carries the date range as epoch milliseconds. The dates are naive,
    # so datetime.timestamp() interprets them in the local timezone.
    date_start = int(datetime.fromisoformat(from_date).timestamp() * 1000)
    # One day is added so that the whole of to_date falls inside the range.
    date_end = int(
        (datetime.fromisoformat(to_date) + timedelta(days=1)).timestamp() * 1000)

    # Probe: any hit at offset 9950 means the query is too broad.
    async with session.get(_build_hs_url(
            query, 9950, 50, date_start, date_end)) as response:
        if response.status != 200:
            raise ValueError(
                f"Got unexpected response code {response.status} for {response.url}.")
        response_json = await response.json()
        if response_json is None:
            # Same handling as in the paging loop, instead of a TypeError from len(None).
            raise ValueError(f"Got empty response for {response.url}")
        if len(response_json) != 0:
            raise ValueError("Query results in more than 9950 results. The HS API refuses to return more than 10000 results, so refusing to continue. You can work around this limitation by doing multiple queries on smaller timespans.")

    offset = 0
    while True:
        async with session.get(_build_hs_url(query, offset, batch_size, date_start, date_end)) as response:
            if response.status != 200:
                raise ValueError(
                    f"Got unexpected response code {response.status} for {response.url}.")
            response_json = await response.json()
            if response_json is None:
                raise ValueError(f"Got empty response for {response.url}")
            if len(response_json) == 0:  # Got 0 results, assuming we're done.
                break
            articles = [
                Article(id=a['id'], url=_build_article_url(a['href']),
                        title=a['title'], date_modified=a['displayDate'])
                for a in response_json
            ]
            # No total is passed, so Result.total is always -1 here.
            yield Result(articles, str(response.url), -1)
            offset += batch_size