Source code for finnish_media_scrapers.htmltotext

"""Functions to extract article plain texts from the YLE/HS/IL/IS HTML articles
"""
import re
from typing import TextIO, Union

from bs4 import BeautifulSoup, NavigableString


def extract_text_from_svyle_html(html: Union[str, TextIO]) -> str:
    """Return the plain text of a Svenska YLE article

    The article is located through the ``article#main-content`` element. The tag
    list, the comment section and the share buttons are detached from the document,
    and a blank line is inserted after every h1-h7, p and div element inside the
    article so that block boundaries survive in the flattened text.

    Args:
        html (Union[str,TextIO]): the article HTML, either as a string or as a file-like object

    Raises:
        ValueError: if no ``article#main-content`` element is present, or if the
            extracted text is empty after stripping whitespace

    Returns:
        str: the article text with surrounding whitespace stripped
    """
    document = BeautifulSoup(html, 'lxml')
    article = document.select_one('article#main-content')
    if article is None:
        raise ValueError("Article layout not recognized")

    # Non-content parts are detached one selector at a time, in this order.
    for selector in ('aside#id-article__tags', '#comments', '.ydd-share-buttons'):
        for unwanted in document.select(selector):
            unwanted.extract()

    # Each separator lands directly after its own element, so the order in which
    # the matching elements are visited does not affect the resulting text.
    block_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'div']
    for block in article.find_all(block_tags):
        block.insert_after(NavigableString('\n\n'))

    text = article.get_text().strip()
    if not text:
        raise ValueError("Parsing results in an empty article")
    return text


def extract_text_from_yle_html(html: Union[str, TextIO]) -> str:
    """Return the plain text of a YLE article

    Three known layouts are tried in priority order: ``.yle__article``, then
    ``#yle__section--article``, then ``article.content``. The first selector that
    matches supplies the article element. A blank line is inserted after every
    h1-h7, p and div element inside it before the text is flattened.

    Args:
        html (Union[str,TextIO]): the article HTML, either as a string or as a file-like object

    Raises:
        ValueError: if none of the known layout selectors match, or if the
            extracted text is empty after stripping whitespace

    Returns:
        str: the article text with surrounding whitespace stripped
    """
    document = BeautifulSoup(html, 'lxml')

    # Selectors are tried one by one because their priority matters; a combined
    # selector would pick whichever element comes first in document order.
    article = None
    for selector in ('.yle__article', '#yle__section--article', 'article.content'):
        article = document.select_one(selector)
        if article is not None:
            break
    if article is None:
        raise ValueError("Article layout not recognized")

    block_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'div']
    for block in article.find_all(block_tags):
        block.insert_after(NavigableString('\n\n'))

    text = article.get_text().strip()
    if not text:
        raise ValueError("Parsing results in an empty article")
    return text


def extract_text_from_is_html(html: Union[str, TextIO]) -> str:
    """Return the plain text of an Ilta-Sanomat article

    The article element is the first ``article`` in the document carrying one of
    the known layout classes. A blank line is inserted after every h1-h7, p and
    div element inside it before the text is flattened.

    Args:
        html (Union[str,TextIO]): the article HTML, either as a string or as a file-like object

    Raises:
        ValueError: if no ``article`` element with a known layout class is found,
            or if the extracted text is empty after stripping whitespace

    Returns:
        str: the article text with surrounding whitespace stripped
    """
    # One selector group; adjacent literals concatenate into a single string.
    layout_selector = (
        'article.single-article,'
        'article.article--m,'
        'article.article--l,'
        'article.article--xl-picture-top,'
        'article.article--xl-title-top'
    )
    article = BeautifulSoup(html, 'lxml').select_one(layout_selector)
    if article is None:
        raise ValueError("Article layout not recognized")

    block_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'div']
    for block in article.find_all(block_tags):
        block.insert_after(NavigableString('\n\n'))

    text = article.get_text().strip()
    if not text:
        raise ValueError("Parsing results in an empty article")
    return text


def extract_text_from_il_html(html: Union[str, TextIO]) -> str:
    """Return the plain text of an Iltalehti article

    The article is taken from the first element with the ``article-content``
    class. A blank line is inserted after every h1-h7, p and div element inside
    it before the text is flattened.

    Args:
        html (Union[str,TextIO]): the article HTML, either as a string or as a file-like object

    Raises:
        ValueError: if no ``.article-content`` element is found, or if the
            extracted text is empty after stripping whitespace

    Returns:
        str: the article text with surrounding whitespace stripped
    """
    document = BeautifulSoup(html, 'lxml')
    content = document.select_one('.article-content')
    if content is None:
        raise ValueError("Article layout not recognized")

    block_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'div']
    for block in content.find_all(block_tags):
        block.insert_after(NavigableString('\n\n'))

    text = content.get_text().strip()
    if not text:
        raise ValueError("Parsing results in an empty article")
    return text


def extract_text_from_hs_html(html: Union[str, TextIO]) -> str:
    """Return the plain text of a Helsingin Sanomat article

    The article container is resolved as follows:

    1. the first element matching ``#__nuxt,article.article--xxl``, if any;
    2. otherwise, searching inside ``<main>`` when the document has one (the whole
       document when it does not), the ``article`` directly following
       ``div#page-main-content``;
    3. otherwise, within that same scope, the first element matching
       ``div#page-main-content,#paid-content``.

    Asides, the div directly following ``section.article-body``, and the
    ``article-info``, ``related-articles`` and ``article-actions`` divs are then
    removed from the container. A blank line is inserted after every h1-h7, p and
    div element, soft hyphens are dropped, and runs of two or more newlines are
    collapsed to exactly two.

    Args:
        html (Union[str,TextIO]): the article HTML, either as a string or as a file-like object

    Raises:
        ValueError: if none of the known layouts match, or if the extracted text
            is empty after stripping whitespace

    Returns:
        str: the article text with surrounding whitespace stripped
    """
    document = BeautifulSoup(html, 'lxml')

    container = document.select_one('#__nuxt,article.article--xxl')
    if container is None:
        # Narrow the search to <main> when present, else keep the whole document.
        main = document.find('main')
        scope = document if main is None else main
        container = scope.select_one('div#page-main-content + article')
        if container is None:
            container = scope.select_one('div#page-main-content,#paid-content')
        if container is None:
            raise ValueError("Article layout not recognized")

    # Asides go first: removing them can change which div directly follows
    # section.article-body, so the order of these removal passes is significant.
    for aside in container.find_all('aside'):
        aside.extract()
    removal_selectors = (
        'section.article-body + div',
        'div.article-info',
        'div.related-articles',
        'div.article-actions',
    )
    for selector in removal_selectors:
        for unwanted in container.select(selector):
            unwanted.extract()

    block_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'div']
    for block in container.find_all(block_tags):
        block.insert_after(NavigableString('\n\n'))

    # Drop soft hyphens first, then squeeze newline runs, then trim the ends.
    without_soft_hyphens = container.get_text().replace("\xad", "")
    text = re.sub("\n\n+", "\n\n", without_soft_hyphens).strip()
    if not text:
        raise ValueError("Parsing results in an empty article")
    return text