Source code for danoan.correct_markdown.core.utils
from bs4 import BeautifulSoup
import markdown # type: ignore
import re
from typing import List, Tuple, TextIO
[docs]
def get_plain_text_from_markdown(markdown_stream: TextIO) -> str:
    """
    Returns the text of a markdown document without its markup.

    The stream is read in full, rendered to html with `markdown.markdown`
    and the text of the resulting html is extracted with BeautifulSoup's
    "html.parser" backend.
    """
    markdown_source = markdown_stream.read()
    rendered_html = markdown.markdown(markdown_source)
    return BeautifulSoup(rendered_html, "html.parser").get_text()
[docs]
def extract_html_tags(html: str) -> List[Tuple[str, int, int]]:
    """
    Parses a html string and extracts all its markup tags.

    Each returned item is a triplet (type, start, end) where start and end
    are the offsets of the item in `html` (end is exclusive) and type is
    one of:
        closing: closing html tag (the tag starts with "</")
        opening: any other html tag
        no_html: text content; emitted once, spanning the whole string,
                 and only when the string contains no tag at all

    Tags are returned in their order of appearance in `html`.
    """
    tag_pattern = r"</?[^>]+>"
    tag_indexes: List[Tuple[str, int, int]] = []
    # finditer yields non-overlapping matches from left to right, so the
    # list is already ordered by position and needs no extra sorting.
    for m in re.finditer(tag_pattern, html):
        start, end = m.span()
        # Look at the tag prefix only: a "</" that shows up later inside the
        # tag (e.g. in an attribute value) does not make it a closing tag.
        kind = "closing" if m.group(0).startswith("</") else "opening"
        tag_indexes.append((kind, start, end))
    if not tag_indexes:
        tag_indexes.append(("no_html", 0, len(html)))
    return tag_indexes
[docs]
def remove_html_tags(string_stream: TextIO) -> str:
    """
    Removes all html tags from a string.

    The stream is read in full, its tags are located with
    `extract_html_tags` and the text found between them is returned
    concatenated.
    """
    content = string_stream.read()
    pieces = []
    last = 0  # offset right after the previously handled item
    for name, i, j in extract_html_tags(content):
        if name == "no_html":
            # No tag in the content: the item spans the text to keep.
            pieces.append(content[i:j])
        else:
            # Keep only the text between the previous item and this tag.
            pieces.append(content[last:i])
        # Move past the item in both cases so the trailing slice below
        # never repeats text that was already collected.
        last = j
    pieces.append(content[last:])
    return "".join(pieces)