"""Read the Morphalou dataset from its TEI (XML) distribution.

Each supported grammatical category lives in its own
``<category>_Morphalou3.1_TEI.xml`` file. :class:`MorphalouSet` parses those
files and stores the words it finds in a ``WordDb``.

This module relies on ``Genre.INV`` and ``WordDb.CATEGORY_TO_ATTR`` being
defined in the sibling ``word_db`` module.
"""

import itertools
import typing as t
from pathlib import Path

from lxml import etree

from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb

# Prefix map handed to every find()/iterfind() call below. Despite its name,
# the "tsv" prefix is bound to the TEI namespace.
TSV_NS = {
    "tsv": "http://www.tei-c.org/ns/1.0",
    "xml": "http://www.w3.org/XML/1998/namespace",
}


class MorphalouSet:
    """Parse the Morphalou TEI files into a ``WordDb``."""

    # Directory holding one TEI file per grammatical category.
    MORPHALOU_DIR_PATH = (
        Path(__file__).parent.parent
        / "data/raw/morphalou/morphalou/5/Morphalou3.1_formatTEI"
    )
    MORPHALOU_FILENAME_TEMPLATE = "{cat_name}_Morphalou3.1_TEI.xml"

    # Word type -> category name used in the Morphalou file names.
    CAT_MAPPING: dict[t.Type[t.NamedTuple], str] = {
        Nom: "commonNoun",
        Adjectif: "adjective",
        Verbe: "verb",
        Adverbe: "adverb",
    }

    word_db: WordDb

    def __init__(self) -> None:
        self.word_db = WordDb()

    def parse(self) -> None:
        """Parse every category file and store the result in ``self.word_db``.

        For each category of ``CAT_MAPPING``, the matching ``_parse_<attr>``
        method is called on the category's file, and its result is assigned
        to the ``WordDb`` attribute named by ``WordDb.CATEGORY_TO_ATTR``.
        """
        cls = type(self)
        for cat, cat_file in cls.CAT_MAPPING.items():
            word_db_elt = WordDb.CATEGORY_TO_ATTR[cat]
            parser = getattr(self, f"_parse_{word_db_elt}")
            path = cls.MORPHALOU_DIR_PATH / cls.MORPHALOU_FILENAME_TEMPLATE.format(
                cat_name=cat_file
            )
            setattr(self.word_db, word_db_elt, parser(path))

    def _tsv_elems(self, tsv_path: Path):
        """Parse a TEI file and return its ``<text>/<body>`` element.

        That element is the direct parent of all the ``<entry>`` nodes the
        category parsers iterate over.

        Raises:
            ValueError: if the document has no ``<text>/<body>`` element in
                the TEI namespace.
        """
        # Parse by file name rather than through a text-mode handle, so that
        # the XML parser decodes the bytes according to the document's own
        # encoding declaration instead of the locale's default encoding.
        tree = etree.parse(str(tsv_path))
        body = tree.getroot().find("./tsv:text/tsv:body", TSV_NS)
        if body is None:
            raise ValueError(f"{tsv_path}: no TEI <text>/<body> element found")
        return body

    def _entries(self, tsv_path: Path):
        """Iterate over the ``<entry>`` elements of a TEI file."""
        return self._tsv_elems(tsv_path).iterfind("./tsv:entry", TSV_NS)

    @staticmethod
    def _inflected(entry):
        """Iterate over the inflected ``<form>`` elements of an entry."""
        return entry.iterfind("./tsv:form[@type='inflected']", TSV_NS)

    def _parse_noms(self, tsv_path: Path) -> list[Nom]:
        """Parse the nouns.

        Entries whose lemma has no gender element, or which lack a singular
        or a plural form, are skipped.
        """
        out: list[Nom] = []

        for entry in self._entries(tsv_path):
            gen = entry.find(
                "./tsv:form[@type='lemma']/tsv:gramGrp/tsv:gen", TSV_NS
            )
            if gen is None:
                continue  # some nouns don't have a gender defined -- ignore
            genre = self._genre(gen.text)

            # Number -> spelling, for every number the entry can take.
            forms = {}
            for inflected in self._inflected(entry):
                orth = inflected.find("./tsv:orth", TSV_NS).text
                nombres = self._nombre_set(
                    inflected.find("./tsv:gramGrp/tsv:number", TSV_NS).text
                )
                for nombre in nombres:
                    forms[nombre] = orth

            try:
                nom = Nom(
                    genre=genre,
                    sing=forms[Nombre.SING],
                    plur=forms[Nombre.PLUR],
                )
            except KeyError:
                continue  # cannot be inflected to all required forms: skip
            out.append(nom)

        return out

    def _parse_adjectifs(self, tsv_path: Path) -> list[Adjectif]:
        """Parse the adjectives.

        Entries lacking any of the four (gender, number) forms are skipped.
        """
        out: list[Adjectif] = []

        for entry in self._entries(tsv_path):
            # (Genre, Nombre) -> spelling.
            forms = {}
            for inflected in self._inflected(entry):
                orth = inflected.find("./tsv:orth", TSV_NS).text
                gram_grp = inflected.find("./tsv:gramGrp", TSV_NS)
                genres = self._genre_set(gram_grp.find("./tsv:gen", TSV_NS).text)
                nombres = self._nombre_set(
                    gram_grp.find("./tsv:number", TSV_NS).text
                )
                # An invariable gender or number covers several combinations.
                for form in itertools.product(genres, nombres):
                    forms[form] = orth

            try:
                adjectif = Adjectif(
                    masc_sing=forms[Genre.MASC, Nombre.SING],
                    masc_plur=forms[Genre.MASC, Nombre.PLUR],
                    fem_sing=forms[Genre.FEM, Nombre.SING],
                    fem_plur=forms[Genre.FEM, Nombre.PLUR],
                )
            except KeyError:
                continue  # cannot be inflected to all required forms: skip
            out.append(adjectif)

        return out

    def _parse_verbes(self, tsv_path: Path) -> list[Verbe]:
        """Parse the verbs.

        Only third-person indicative forms in the present, imperfect and
        future tenses are kept. Entries lacking any of the six resulting
        (tense, number) forms are skipped.
        """
        out: list[Verbe] = []

        for entry in self._entries(tsv_path):
            # (Temps, Nombre) -> spelling.
            forms = {}
            for inflected in self._inflected(entry):
                gram_grp = inflected.find("./tsv:gramGrp", TSV_NS)

                # Order of tests is important! If mood == 'participle', there
                # is no 'person' defined, so the mood must be checked first.
                if gram_grp.find("./tsv:mood", TSV_NS).text != "indicative":
                    continue  # irrelevant for us
                if gram_grp.find("./tsv:per", TSV_NS).text != "thirdPerson":
                    continue  # irrelevant for us

                temps = self._tense(gram_grp.find("./tsv:tns", TSV_NS).text)
                if temps is None:
                    continue  # irrelevant for us

                nombres = self._nombre_set(
                    gram_grp.find("./tsv:number", TSV_NS).text
                )
                orth = inflected.find("./tsv:orth", TSV_NS).text
                for nombre in nombres:
                    forms[(temps, nombre)] = orth

            try:
                verbe = Verbe(
                    present_sing=forms[Temps.PRESENT, Nombre.SING],
                    present_plur=forms[Temps.PRESENT, Nombre.PLUR],
                    futur_sing=forms[Temps.FUTUR, Nombre.SING],
                    futur_plur=forms[Temps.FUTUR, Nombre.PLUR],
                    imparfait_sing=forms[Temps.IMPARFAIT, Nombre.SING],
                    imparfait_plur=forms[Temps.IMPARFAIT, Nombre.PLUR],
                )
            except KeyError:
                continue  # cannot be inflected to all required forms: skip
            out.append(verbe)

        return out

    def _parse_adverbes(self, tsv_path: Path) -> list[Adverbe]:
        """Parse the adverbs, keeping only the lemma form of each entry.

        Raises:
            ValueError: if an entry has no lemma spelling element.
        """
        out: list[Adverbe] = []

        for entry in self._entries(tsv_path):
            # We're only interested in the lemma form
            orth = entry.find("./tsv:form[@type='lemma']/tsv:orth", TSV_NS)
            if orth is None:
                # Raised explicitly: an ``assert`` would be stripped under -O.
                raise ValueError(f"{tsv_path}: adverb entry without a lemma form")
            out.append(Adverbe(adv=orth.text))

        return out

    @staticmethod
    def _genre_set(genre: str) -> list[Genre]:
        """Return the genders covered by a Morphalou gender value.

        ``"invariable"`` covers both masculine and feminine. Raises
        ``KeyError`` for any other unknown value.
        """
        return {
            "masculine": [Genre.MASC],
            "feminine": [Genre.FEM],
            "invariable": [Genre.MASC, Genre.FEM],
        }[genre]

    @staticmethod
    def _genre(genre: str) -> Genre:
        """Map a Morphalou gender value to a ``Genre``.

        Raises ``KeyError`` for an unknown value.
        """
        return {
            "masculine": Genre.MASC,
            "feminine": Genre.FEM,
            "invariable": Genre.INV,
        }[genre]

    @staticmethod
    def _nombre(nombre: str) -> Nombre:
        """Map a Morphalou number value to a ``Nombre``.

        Raises ``KeyError`` for an unknown value, including ``"invariable"``.
        """
        return {
            "singular": Nombre.SING,
            "plural": Nombre.PLUR,
        }[nombre]

    @staticmethod
    def _nombre_set(nombre: str) -> list[Nombre]:
        """Return the numbers covered by a Morphalou number value.

        ``"invariable"`` covers both singular and plural. Raises ``KeyError``
        for any other unknown value.
        """
        return {
            "singular": [Nombre.SING],
            "plural": [Nombre.PLUR],
            "invariable": [Nombre.SING, Nombre.PLUR],
        }[nombre]

    @staticmethod
    def _tense(tense: str) -> t.Optional[Temps]:
        """Map a Morphalou tense value to a ``Temps``.

        Returns ``None`` for any tense other than present, imperfect or
        future.
        """
        return {
            "present": Temps.PRESENT,
            "imperfect": Temps.IMPARFAIT,
            "future": Temps.FUTUR,
        }.get(tense)