"""Source code for danoan.word_def.plugins.modules.english_collins: English word_def plugin backed by the Collins dictionary API."""

from danoan.word_def.core import model, exception

from danoan.dictionaries.collins.core import api as collins_api, model as collins_model

from bs4 import BeautifulSoup
from dataclasses import dataclass
import importlib
import json
import pycountry
from typing import List, Optional, TextIO
import toml


@dataclass
class Configuration:
    """Settings required to query the Collins dictionary API.

    Both values are forwarded unchanged to
    ``collins_api.get_best_matching`` by the adapter.
    """

    # Passed as the first argument of the Collins API call.
    entrypoint: str
    # Passed as the second argument of the Collins API call.
    secret_key: str
class Adapter:
    """Fetch English definitions and part-of-speech tags from the Collins API.

    Each public lookup is split in two private steps: ``_get_*_api`` performs
    the request and ``_get_*_handle`` turns the response into the result.
    """

    # Definition fragments of this many characters or fewer are discarded;
    # see the comment in `_get_definition_handle` for the rationale.
    _SHORT_DEFINITION_LIMIT = 16

    def __init__(self, configuration: Configuration):
        """Store the configuration used for every request."""
        self.configuration = configuration

    def _request_best_matching(self, word: str):
        """Ask Collins for the English entry best matching `word`, as JSON."""
        return collins_api.get_best_matching(
            self.configuration.entrypoint,
            self.configuration.secret_key,
            collins_model.Language.English,
            word,
            collins_model.Format.JSON,
        )

    @staticmethod
    def _select_entry_elements(response, css_selector: str):
        """Return the elements of the entry HTML that match `css_selector`.

        The response body is expected to be JSON with the entry HTML stored
        under the ``entryContent`` key.

        Raises:
            exception.UnexpectedResponseError: if the status code is not 200.
        """
        if response.status_code != 200:
            raise exception.UnexpectedResponseError(
                response.status_code, response.text
            )

        response_json = json.loads(response.text)
        html_soup = BeautifulSoup(response_json["entryContent"], "lxml")
        return html_soup.css.select(css_selector)

    def _get_definition_api(self, word: str):
        """Request the Collins entry from which definitions are extracted."""
        return self._request_best_matching(word)

    def _get_definition_handle(self, response: collins_api.requests.Response):
        """Extract the definition texts from a Collins response.

        Returns:
            The stripped leading text of every ``.def`` element that is longer
            than 16 characters, in document order.

        Raises:
            exception.UnexpectedResponseError: if the status code is not 200.
        """
        # TODO: This could be improved. The response contains much more
        # tagged information than the definition but it is not that
        # straightforward to parse, that is why we are limiting ourselves
        # to take the definition.

        # Due to how the HTML is written, the text of the definition is
        # sometimes spread over more than two tags. We are limiting ourselves
        # to the `def` tag. We use the character count limit of 16 to avoid
        # situations in which we collect incomplete phrasings.
        list_of_definitions = []
        for potential_definition in self._select_entry_elements(response, ".def"):
            children = potential_definition.contents
            # An empty element, or one that opens with nested markup instead
            # of text, has no leading text to collect. Skip it rather than
            # fail with IndexError / TypeError on `contents[0].strip()`.
            if not children or not isinstance(children[0], str):
                continue

            content = children[0].strip()
            if len(content) > self._SHORT_DEFINITION_LIMIT:
                list_of_definitions.append(content)
        return list_of_definitions

    def get_definition(self, word: str) -> List[str]:
        """Return the definitions Collins lists for `word`.

        Raises:
            exception.UnexpectedResponseError: if the API does not answer
                with status code 200.
        """
        response = self._get_definition_api(word)
        return self._get_definition_handle(response)

    @staticmethod
    def _to_pos_tag(pos_tag_string: str):
        """Map a Collins part-of-speech label to a `model.PosTag`.

        Labels missing from the table are mapped to the empty string.
        """
        string_to_pos_tag = {
            "adjective": model.PosTag.Adjective,
            "adverb": model.PosTag.Adverb,
            "auxiliary": model.PosTag.Auxiliary,
            "conjunction": model.PosTag.Conjunction,
            "determiner": model.PosTag.Determiner,
            "interjection": model.PosTag.Interjection,
            "exclamation": model.PosTag.Interjection,
            "noun": model.PosTag.Noun,
            "numeral": model.PosTag.Numeral,
            "particle": model.PosTag.Particle,
            "pronoun": model.PosTag.Pronoun,
            "verb": model.PosTag.Verb,
        }
        return string_to_pos_tag.get(pos_tag_string, "")

    def _get_pos_tag_api(self, word: str):
        """Request the Collins entry from which part-of-speech tags are read."""
        return self._request_best_matching(word)

    def _get_pos_tag_handle(self, response: collins_api.requests.Response):
        """Extract the distinct part-of-speech tags from a Collins response.

        Returns:
            A list without duplicates, in no particular order. Labels that
            `_to_pos_tag` does not recognise contribute an empty string.

        Raises:
            exception.UnexpectedResponseError: if the status code is not 200.
        """
        list_of_span_pos = self._select_entry_elements(response, ".pos")
        # Elements without children carry no label; skipping them avoids an
        # IndexError on `contents[0]`.
        distinct_tags = {
            self._to_pos_tag(span.contents[0])
            for span in list_of_span_pos
            if span.contents
        }
        return list(distinct_tags)

    def get_pos_tag(self, word: str) -> List[model.PosTag]:
        """Return the part-of-speech tags Collins lists for `word`.

        Raises:
            exception.UnexpectedResponseError: if the API does not answer
                with status code 200.
        """
        response = self._get_pos_tag_api(word)
        return self._get_pos_tag_handle(response)
class AdapterFactory:
    """Factory exposing the plugin version, its language and a configured Adapter."""

    def version(self):
        """Return the installed version of ``word-def-plugin-english-collins``."""
        # `import importlib` alone does not guarantee that the `metadata`
        # submodule is loaded, so `importlib.metadata` may raise
        # AttributeError. Import the submodule explicitly.
        from importlib import metadata

        return metadata.version("word-def-plugin-english-collins")

    def get_language(self) -> str:
        """Return the three-letter (`alpha_3`) code pycountry gives for English."""
        return pycountry.languages.get(name="english").alpha_3

    def get_adapter(self, configuration_stream: Optional[TextIO] = None) -> Adapter:
        """Build an `Adapter` from a TOML configuration stream.

        Args:
            configuration_stream: Text stream with TOML content. Its top-level
                keys are passed as keyword arguments to `Configuration`, so it
                must define exactly `entrypoint` and `secret_key`.

        Raises:
            exception.ConfigurationFileRequiredError: if no stream is given.
        """
        if configuration_stream is None:
            raise exception.ConfigurationFileRequiredError()

        configuration = Configuration(**toml.load(configuration_stream))
        return Adapter(configuration)