""" Reads the Morphalou dataset, in its TEI XML form.

Note: several names in this module (``TSV_NS``, ``_tsv_elems``, ``tsv_path``)
say "TSV", but the files that are read are the TEI XML files of Morphalou.
The names are kept as they are for backward compatibility.
"""

import itertools
import typing as t
from pathlib import Path

from lxml import etree

from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb

# Namespace prefixes used in every XPath query below. The "tsv" prefix is
# bound to the TEI namespace.
TSV_NS = {
    "tsv": "http://www.tei-c.org/ns/1.0",
    "xml": "http://www.w3.org/XML/1998/namespace",
}


class MorphalouSet:
    """Loads the Morphalou lexicon files into a :class:`WordDb`.

    One XML file is read per word category listed in ``CAT_MAPPING``; the
    parsed entries are stored on ``self.word_db`` by :meth:`parse`.
    """

    # Directory holding the per-category Morphalou XML files.
    MORPHALOU_DIR_PATH = (
        Path(__file__).parent.parent
        / "data/raw/morphalou/morphalou/5/Morphalou3.1_formatTEI"
    )
    # File name of one category; ``cat_name`` is a value of ``CAT_MAPPING``.
    MORPHALOU_FILENAME_TEMPLATE = "{cat_name}_Morphalou3.1_TEI.xml"

    # Word type -> category name as it appears in the Morphalou file names.
    CAT_MAPPING: dict[t.Type[t.NamedTuple], str] = {
        Nom: "commonNoun",
        Adjectif: "adjective",
        Verbe: "verb",
        Adverbe: "adverb",
    }

    word_db: WordDb

    def __init__(self):
        self.word_db = WordDb()

    def parse(self) -> None:
        """Parse every category file and store the results on ``self.word_db``.

        For each category, the attribute name is looked up in
        ``WordDb.CATEGORY_TO_ATTR`` and the matching ``_parse_<attribute>``
        method of this class is called on the category's file.
        """
        cls = self.__class__
        for cat, cat_file in cls.CAT_MAPPING.items():
            word_db_elt = WordDb.CATEGORY_TO_ATTR[cat]
            parser = getattr(self, f"_parse_{word_db_elt}")
            file_name = cls.MORPHALOU_FILENAME_TEMPLATE.format(cat_name=cat_file)
            setattr(self.word_db, word_db_elt, parser(cls.MORPHALOU_DIR_PATH / file_name))

    def _tsv_elems(self, tsv_path: Path):
        """Open a Morphalou XML file and return the node that is the direct
        parent of all the relevant ``entry`` nodes (``text/body``).

        The result of ``find`` is returned as is, so it is ``None`` when the
        document has no ``text/body`` element.
        """
        # Binary mode on purpose: the XML parser must see the raw bytes so that
        # it applies the encoding declared by the document. In text mode Python
        # would decode with the platform's default encoding first, which is not
        # necessarily the file's encoding.
        with tsv_path.open("rb") as h:
            tree = etree.parse(h)
        root = tree.getroot()
        return root.find("./tsv:text/tsv:body", TSV_NS)

    def _parse_noms(self, tsv_path: Path) -> list[Nom]:
        """Parse the nouns.

        An entry is skipped when its lemma has no gender element, or when it
        does not provide both a singular and a plural form.
        """
        root = self._tsv_elems(tsv_path)
        out: list[Nom] = []
        for entry in root.iterfind("./tsv:entry", TSV_NS):
            gen_node = entry.find(
                "./tsv:form[@type='lemma']/tsv:gramGrp/tsv:gen", TSV_NS
            )
            if gen_node is None:
                continue  # some nouns don't have a gender defined, somehow -- ignore
            genre = self._genre(gen_node.text)

            forms = {}
            for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
                orth = inflected.find("./tsv:orth", TSV_NS).text
                nombres = self._nombre_set(
                    inflected.find("./tsv:gramGrp/tsv:number", TSV_NS).text
                )
                # An "invariable" form fills both the singular and plural slots.
                for form in nombres:
                    forms[form] = orth

            try:
                out.append(
                    Nom(
                        genre=genre,
                        sing=forms[Nombre.SING],
                        plur=forms[Nombre.PLUR],
                    )
                )
            except KeyError:
                continue  # cannot be inflected to all required forms: skip
        return out

    def _parse_adjectifs(self, tsv_path: Path) -> list[Adjectif]:
        """Parse the adjectives.

        An entry is skipped unless all four (gender, number) combinations are
        covered by its inflected forms.
        """
        root = self._tsv_elems(tsv_path)
        out: list[Adjectif] = []
        for entry in root.iterfind("./tsv:entry", TSV_NS):
            forms = {}
            for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
                orth = inflected.find("./tsv:orth", TSV_NS).text
                gram_grp = inflected.find("./tsv:gramGrp", TSV_NS)
                genres = self._genre_set(gram_grp.find("./tsv:gen", TSV_NS).text)
                nombres = self._nombre_set(gram_grp.find("./tsv:number", TSV_NS).text)
                # "invariable" genders/numbers expand to several combinations.
                for form in itertools.product(genres, nombres):
                    forms[form] = orth

            try:
                out.append(
                    Adjectif(
                        masc_sing=forms[Genre.MASC, Nombre.SING],
                        masc_plur=forms[Genre.MASC, Nombre.PLUR],
                        fem_sing=forms[Genre.FEM, Nombre.SING],
                        fem_plur=forms[Genre.FEM, Nombre.PLUR],
                    )
                )
            except KeyError:
                continue  # cannot be inflected to all required forms: skip
        return out

    def _parse_verbes(self, tsv_path: Path) -> list[Verbe]:
        """Parse the verbs.

        Only third-person indicative forms in the present, imperfect and
        future tenses are kept. An entry is skipped unless it has a singular
        and a plural form for each of these three tenses.
        """
        root = self._tsv_elems(tsv_path)
        out: list[Verbe] = []
        for entry in root.iterfind("./tsv:entry", TSV_NS):
            forms = {}
            for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
                gram_grp = inflected.find("./tsv:gramGrp", TSV_NS)
                # Order of tests is important! If mood == 'participle', there is no
                # 'person' defined.
                if (
                    gram_grp.find("./tsv:mood", TSV_NS).text != "indicative"
                    or gram_grp.find("./tsv:per", TSV_NS).text != "thirdPerson"
                ):
                    continue  # irrelevant for us
                temps = self._tense(gram_grp.find("./tsv:tns", TSV_NS).text)
                if temps is None:
                    continue  # irrelevant for us
                nombres = self._nombre_set(gram_grp.find("./tsv:number", TSV_NS).text)
                orth = inflected.find("./tsv:orth", TSV_NS).text
                for nombre in nombres:
                    forms[(temps, nombre)] = orth

            try:
                out.append(
                    Verbe(
                        present_sing=forms[Temps.PRESENT, Nombre.SING],
                        present_plur=forms[Temps.PRESENT, Nombre.PLUR],
                        futur_sing=forms[Temps.FUTUR, Nombre.SING],
                        futur_plur=forms[Temps.FUTUR, Nombre.PLUR],
                        imparfait_sing=forms[Temps.IMPARFAIT, Nombre.SING],
                        imparfait_plur=forms[Temps.IMPARFAIT, Nombre.PLUR],
                    )
                )
            except KeyError:
                continue  # cannot be inflected to all required forms: skip
        return out

    def _parse_adverbes(self, tsv_path: Path) -> list[Adverbe]:
        """Parse the adverbs. Only the lemma spelling of each entry is used."""
        root = self._tsv_elems(tsv_path)
        out: list[Adverbe] = []
        for entry in root.iterfind("./tsv:entry", TSV_NS):
            # We're only interested in the lemma form
            orth = entry.find("./tsv:form[@type='lemma']/tsv:orth", TSV_NS)
            assert orth is not None
            adv = orth.text
            out.append(Adverbe(adv=adv))
        return out

    @staticmethod
    def _genre_set(genre: str) -> list[Genre]:
        """Map a gender label to the list of concrete genders it covers.

        "invariable" covers both masculine and feminine. Raises ``KeyError``
        for any other label.
        """
        return {
            "masculine": [Genre.MASC],
            "feminine": [Genre.FEM],
            "invariable": [Genre.MASC, Genre.FEM],
        }[genre]

    @staticmethod
    def _genre(genre: str) -> Genre:
        """Map a gender label to a single ``Genre``; "invariable" maps to
        ``Genre.INV``. Raises ``KeyError`` for any other label."""
        return {
            "masculine": Genre.MASC,
            "feminine": Genre.FEM,
            "invariable": Genre.INV,
        }[genre]

    @staticmethod
    def _nombre(nombre: str) -> Nombre:
        """Map "singular"/"plural" to a ``Nombre``. Raises ``KeyError`` for
        any other label (including "invariable")."""
        return {
            "singular": Nombre.SING,
            "plural": Nombre.PLUR,
        }[nombre]

    @staticmethod
    def _nombre_set(nombre: str) -> list[Nombre]:
        """Map a number label to the list of concrete numbers it covers.

        "invariable" covers both singular and plural. Raises ``KeyError`` for
        any other label.
        """
        return {
            "singular": [Nombre.SING],
            "plural": [Nombre.PLUR],
            "invariable": [Nombre.SING, Nombre.PLUR],
        }[nombre]

    @staticmethod
    def _tense(tense: str) -> t.Optional[Temps]:
        """Map a tense label to a ``Temps``, or ``None`` for tenses that are
        not handled (anything but present, imperfect and future)."""
        return {
            "present": Temps.PRESENT,
            "imperfect": Temps.IMPARFAIT,
            "future": Temps.FUTUR,
        }.get(tense)