Source code for finnish_media_scrapers.fetch

"""Utilities for fetching articles.
Currently only affects Helsingin Sanomat. Scraping of the other sources can be done using just requests,
but HS requires the user to be logged in and renders its articles using dynamic JavaScript,
thus requiring a pyppeteer browser session to enable fetching the articles.
"""

import asyncio

from pyppeteer.browser import Page
from pyppeteer.errors import NetworkError

async def prepare_session_hs(
    session: Page,
    username: str,
    password: str,
    max_web_driver_wait: int = 30,
):
    """Prepare a pyppeteer session for scraping Helsingin Sanomat by logging in.

    Opens the HS front page, dismisses the cookie consent dialog, opens the
    login form, submits the given credentials and waits for the navigation
    triggered by the submit to finish.

    Args:
        session (Page): the pyppeteer session to use
        username (str): the username to log in as
        password (str): the password to use for logging in
        max_web_driver_wait (int): the maximum number of seconds to wait for each
            page load / element lookup before failing (default: 30)

    Raises:
        TimeoutError: (pyppeteer's timeout error) if an expected element or
            navigation does not appear within max_web_driver_wait seconds.
            May indicate changes to the login page structure.
    """
    # pyppeteer expects its timeouts in milliseconds.
    timeout_ms = 1000 * max_web_driver_wait

    await session.goto("https://www.hs.fi", timeout=timeout_ms)

    # The consent dialog is rendered inside its own iframe, so the OK button
    # has to be looked up within that frame rather than on the page.
    consent_iframe = await session.waitForXPath("//iframe[@title='SP Consent Message']", timeout=timeout_ms)
    consent_frame = await consent_iframe.contentFrame()
    ok_button = await consent_frame.waitForXPath("//button[@title='OK']", timeout=timeout_ms)
    await ok_button.click()

    login_link = await session.waitForXPath("//*[contains(text(), 'Kirjaudu')]", timeout=timeout_ms)
    await login_link.click()

    user_field = await session.waitForSelector("#username", timeout=timeout_ms)
    await user_field.type(username)
    password_field = await session.waitForSelector("#password", timeout=timeout_ms)
    await password_field.type(password)

    submit_button = await session.waitForSelector("button[type=submit]", timeout=timeout_ms)
    # Start waiting for the navigation *before* the click is issued: awaiting
    # the click first can miss a navigation that completes quickly, leaving
    # waitForNavigation to hang until it times out. The wait also honours the
    # caller-supplied timeout instead of pyppeteer's default.
    await asyncio.gather(
        session.waitForNavigation(timeout=timeout_ms),
        submit_button.click(),
    )


async def fetch_article_hs(
    session: Page,
    url: str,
    max_web_driver_wait: int = 30,
) -> str:
    """Fetch the HTML of a single HS article.

    Expects a pyppeteer session on which prepare_session_hs has been called before.

    Args:
        session (Page): the pyppeteer session to use
        url (str): the HS article URL to fetch article content from
        max_web_driver_wait (int): the maximum number of seconds to wait for each
            page load / element lookup before failing (default: 30)

    Raises:
        ValueError: if the page does not exist (HTTP 404 or a network error while
            navigating), or if the expected dynamic content cannot be found in
            time, probably due to encountering a previously unknown layout

    Returns:
        str: the HTML of the article
    """
    # pyppeteer expects its timeouts in milliseconds.
    timeout_ms = 1000 * max_web_driver_wait

    try:
        response = await session.goto(url, timeout=timeout_ms)
    except NetworkError as network_exception:
        raise ValueError(f"The page doesn't exist for {url}.") from network_exception
    # goto() may return None instead of a response object, so guard before
    # reading the status code.
    if response is not None and response.status == 404:
        raise ValueError(f"The page doesn't exist for {url}.")

    try:
        main_content = await session.waitForXPath(
            "//div[@id='page-main-content']/following-sibling::*",
            timeout=timeout_ms,
        )
        tag_name = await (await main_content.getProperty('tagName')).jsonValue()
        if tag_name == 'IFRAME':
            # The article body is embedded in an iframe; read the HTML from it.
            content = await main_content.contentFrame()
            # NOTE(review): the paywall wait is assumed to belong to the iframe
            # branch — confirm against the intended layout handling.
            if await session.xpath("//div[@class='paywall-container']"):
                await session.waitForXPath(
                    "//div[@class='paywall-content']|//div[@id='paid-content']",
                    timeout=timeout_ms,
                )
        else:
            content = session
    except (TimeoutError, asyncio.TimeoutError) as timeout_exception:
        # pyppeteer's timeout derives from asyncio.TimeoutError, which is not
        # the builtin TimeoutError on Python < 3.11, so both are caught here.
        raise ValueError(
            f"Couldn't find the dynamic content I was looking for in {url}. There may be a class of HS articles we're not yet handling."
        ) from timeout_exception

    return await content.content()