Source code for danoan.correct_markdown.core.utils

from bs4 import BeautifulSoup
import markdown  # type: ignore
import re
from typing import List, Tuple, TextIO


def get_plain_text_from_markdown(markdown_stream: TextIO) -> str:
    """
    Return the text of a markdown document with its markup removed.

    The stream is read in full, rendered to html with ``markdown.markdown``,
    and the rendered html is parsed with BeautifulSoup's ``html.parser`` so
    that only the text content is returned.
    """
    markdown_source = markdown_stream.read()
    rendered_html = markdown.markdown(markdown_source)
    return BeautifulSoup(rendered_html, "html.parser").get_text()
def extract_html_tags(html: str) -> List[Tuple[str, int, int]]:
    """
    Parses a html string and extracts all its markup tags.

    Each returned item is a triplet (type, start, end) where start and end
    are the indexes of the matched span in `html` (end is exclusive).

    type:
        closing: closing html tag, i.e. the match starts with "</"
        opening: any other tag (self-closing tags such as "<br/>" are
                 reported as opening)
        no_html: text content; emitted as the single item
                 ("no_html", 0, len(html)) when `html` contains no tag
                 at all

    Items are returned in order of appearance in `html`.
    """
    tag_pattern = r"</?[^>]+>"

    tag_indexes: List[Tuple[str, int, int]] = []
    for m in re.finditer(tag_pattern, html):
        start, end = m.span()
        # Only the prefix decides whether a tag is a closing one: "</" may
        # legitimately appear later in an opening tag, e.g. inside an
        # attribute value.
        kind = "closing" if m.group(0).startswith("</") else "opening"
        tag_indexes.append((kind, start, end))

    if not tag_indexes:
        # No markup found: the whole string is plain text.
        tag_indexes.append(("no_html", 0, len(html)))

    # re.finditer yields non-overlapping matches from left to right, so the
    # list is already ordered by end index.
    return tag_indexes
def remove_html_tags(string_stream: TextIO) -> str:
    """
    Removes all html tags from a string.

    The stream is read in full, its tags are located with
    `extract_html_tags`, and the text found between consecutive tags is
    concatenated. Content without any tag is returned unchanged.
    """
    content = string_stream.read()

    pieces: List[str] = []
    cursor = 0  # index right after the last span already handled
    for kind, start, end in extract_html_tags(content):
        if kind == "no_html":
            # The span covers the text itself, not a tag: keep it.
            pieces.append(content[start:end])
        else:
            # Keep the text between the previous tag and this one.
            pieces.append(content[cursor:start])
        cursor = end

    # Text after the last tag (empty when the content had no tag).
    pieces.append(content[cursor:])
    return "".join(pieces)