Source code for finnish_media_scrapers.htmltotext
"""Functions to extract article plain texts from the YLE/HS/IL/IS HTML articles
"""
import re
from typing import TextIO, Union
from bs4 import BeautifulSoup, NavigableString
def extract_text_from_svyle_html(html: Union[str, TextIO]) -> str:
    """Return the plain text of a Svenska YLE article.

    The text is read from the ``article#main-content`` element. Before that,
    the tag list (``aside#id-article__tags``), the ``#comments`` element and
    the ``.ydd-share-buttons`` elements are removed from the document, and a
    blank line is inserted after every heading, paragraph and div inside the
    article so that block boundaries remain visible in the text.

    Args:
        html (Union[str,TextIO]): the article HTML as a string or a file-like object

    Raises:
        ValueError: no ``article#main-content`` element was found, or the extracted text is empty

    Returns:
        str: the article text, stripped of leading and trailing whitespace
    """
    document = BeautifulSoup(html, 'lxml')
    article = document.select_one('article#main-content')
    if article is None:
        raise ValueError("Article layout not recognized")

    # Non-article elements are removed document-wide, in this fixed order.
    for selector in ('aside#id-article__tags', '#comments', '.ydd-share-buttons'):
        for unwanted in document.select(selector):
            unwanted.extract()

    # Mark the end of every block-level element with a blank line.
    block_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'div']
    for block in article.find_all(block_tags):
        block.insert_after(NavigableString('\n\n'))

    text = article.get_text().strip()
    if not text:
        raise ValueError("Parsing results in an empty article")
    return text
def extract_text_from_yle_html(html: Union[str, TextIO]) -> str:
    """Return the plain text of a YLE article.

    Three container selectors are tried in order (``.yle__article``,
    ``#yle__section--article``, ``article.content``) and the first element
    found is used. A blank line is inserted after every heading, paragraph
    and div inside it before the text is read.

    Args:
        html (Union[str,TextIO]): the article HTML as a string or a file-like object

    Raises:
        ValueError: none of the known containers was found, or the extracted text is empty

    Returns:
        str: the article text, stripped of leading and trailing whitespace
    """
    document = BeautifulSoup(html, 'lxml')

    # The generator is lazy, so later selectors are only evaluated when the
    # earlier ones did not match.
    layout_selectors = ('.yle__article', '#yle__section--article', 'article.content')
    candidates = (document.select_one(selector) for selector in layout_selectors)
    article = next((found for found in candidates if found is not None), None)
    if article is None:
        raise ValueError("Article layout not recognized")

    # Mark the end of every block-level element with a blank line.
    block_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'div']
    for block in article.find_all(block_tags):
        block.insert_after(NavigableString('\n\n'))

    text = article.get_text().strip()
    if not text:
        raise ValueError("Parsing results in an empty article")
    return text
def extract_text_from_is_html(html: Union[str, TextIO]) -> str:
    """Return the plain text of an Ilta-Sanomat article.

    The article is the first ``<article>`` element carrying one of the known
    layout classes (``single-article``, ``article--m``, ``article--l``,
    ``article--xl-picture-top``, ``article--xl-title-top``). A blank line is
    inserted after every heading, paragraph and div inside it before the
    text is read.

    Args:
        html (Union[str,TextIO]): the article HTML as a string or a file-like object

    Raises:
        ValueError: no article element with a known layout class was found, or the extracted text is empty

    Returns:
        str: the article text, stripped of leading and trailing whitespace
    """
    layout_selector = 'article.single-article,article.article--m,article.article--l,article.article--xl-picture-top,article.article--xl-title-top'
    article = BeautifulSoup(html, 'lxml').select_one(layout_selector)
    if article is None:
        raise ValueError("Article layout not recognized")

    # Mark the end of every block-level element with a blank line.
    block_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'div']
    for block in article.find_all(block_tags):
        block.insert_after(NavigableString('\n\n'))

    text = article.get_text().strip()
    if not text:
        raise ValueError("Parsing results in an empty article")
    return text
def extract_text_from_il_html(html: Union[str, TextIO]) -> str:
    """Return the plain text of an Iltalehti article.

    The text is read from the first element with class ``article-content``.
    A blank line is inserted after every heading, paragraph and div inside
    it before the text is read.

    Args:
        html (Union[str,TextIO]): the article HTML as a string or a file-like object

    Raises:
        ValueError: no ``.article-content`` element was found, or the extracted text is empty

    Returns:
        str: the article text, stripped of leading and trailing whitespace
    """
    document = BeautifulSoup(html, 'lxml')
    content = document.select_one('.article-content')
    if content is None:
        raise ValueError("Article layout not recognized")

    # Mark the end of every block-level element with a blank line.
    block_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'div']
    for block in content.find_all(block_tags):
        block.insert_after(NavigableString('\n\n'))

    text = content.get_text().strip()
    if not text:
        raise ValueError("Parsing results in an empty article")
    return text
def extract_text_from_hs_html(html: Union[str, TextIO]) -> str:
    """Return the plain text of a Helsingin Sanomat article.

    The article container is located by trying, in order:

    1. ``#__nuxt`` or ``article.article--xxl`` anywhere in the document;
    2. ``div#page-main-content + article`` (searched inside ``<main>`` when
       the document has one, otherwise in the whole document);
    3. ``div#page-main-content`` or ``#paid-content`` in that same scope.

    Asides, the div directly following ``section.article-body``, and the
    ``article-info``, ``related-articles`` and ``article-actions`` divs are
    then removed from the container. A blank line is inserted after every
    heading, paragraph and div, soft hyphens are deleted and runs of two or
    more newlines are collapsed into a single blank line.

    Args:
        html (Union[str,TextIO]): the article HTML as a string or a file-like object

    Raises:
        ValueError: none of the known containers was found, or the extracted text is empty

    Returns:
        str: the article text, stripped of leading and trailing whitespace
    """
    document = BeautifulSoup(html, 'lxml')

    # NOTE(review): the nesting of this layout cascade was reconstructed from
    # statement order -- confirm against the upstream source.
    container = document.select_one('#__nuxt,article.article--xxl')
    if container is None:
        scope = document.find('main')
        if scope is None:
            scope = document
        container = scope.select_one('div#page-main-content + article')
        if container is None:
            container = scope.select_one('div#page-main-content,#paid-content')
        if container is None:
            raise ValueError("Article layout not recognized")

    # Asides go first: the sibling selector below is evaluated against the
    # tree as it looks after their removal.
    for unwanted in container.find_all('aside'):
        unwanted.extract()
    removal_selectors = (
        'section.article-body + div',
        'div.article-info',
        'div.related-articles',
        'div.article-actions',
    )
    for selector in removal_selectors:
        for unwanted in container.select(selector):
            unwanted.extract()

    # Mark the end of every block-level element with a blank line.
    block_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'div']
    for block in container.find_all(block_tags):
        block.insert_after(NavigableString('\n\n'))

    without_soft_hyphens = container.get_text().replace("\xad", "")
    text = re.sub("\n\n+", "\n\n", without_soft_hyphens).strip()
    if not text:
        raise ValueError("Parsing results in an empty article")
    return text