from typing import List
import re
import os
import sys

# Command line: <corpus file> <tag> [<tag> ...]
corpus_file = sys.argv[1]
# Keep only the third group of the corpus path (the "/.../" segment found
# after "PERL") to name the output directory. Raw strings are used so the
# backslashes reach the regex engine without invalid-escape warnings; the
# pattern text itself is unchanged.
# NOTE(review): when the path does not match, re.sub returns it unchanged and
# `rubrique` is then the whole input path -- confirm the expected path layout.
rubrique = re.sub(r'(\..+?)(PERL)(\/.+/)(corpus-titre-description.xml)', r'\g<3>', corpus_file)
# Every remaining argument is one tag of the pattern to search for.
patron = sys.argv[2:]

# exist_ok=True replaces the separate existence check and creation call.
os.makedirs(f'PYTHON{rubrique}', exist_ok=True)
patronFichier = "_".join(patron)

# Build the output path once so the printed path and the opened path agree.
output_path = f'./PYTHON{rubrique}PATRON_{patronFichier}.txt'
print(output_path)
# Module-level output handle: extract() writes its matches to `f`.
f = open(output_path, 'w', encoding='utf-8')

# Matches one token line of the corpus. Group 1 captures the "type" value and
# group 2 the "string" value; the "lemma" value is matched but not captured.
# Compiled once here instead of being looked up again for every line.
_ELEMENT_RE = re.compile(' <element><data type="type">([^<]+?)</data><data type="lemma">[^<]+?</data><data type="string">([^<]+?)</data></element>')


def extract(corpus_file: str, patron: List[str], out=None) -> None:
    """Write every run of consecutive tokens whose types equal *patron*.

    The corpus is read line by line. Lines matching the ``<element>`` token
    format feed a sliding window of ``len(patron)`` tokens; any other line
    empties the window, so a match never spans a non-token line. Each time
    the window's types equal *patron* item by item, one line of the form
    ``"string/TYPE string/TYPE \\n"`` (with a trailing space before the
    newline) is written.

    Args:
        corpus_file: Path of the corpus to scan. It is read as UTF-8, the
            same encoding the script uses for its output file.
        patron: Sequence of types to look for, in order. An empty sequence
            matches nothing and the function returns without reading.
        out: Writable text stream receiving the matches. When omitted, the
            module-level file object ``f`` is used.
    """
    # Function-scope import keeps this block self-contained.
    from collections import deque

    if not patron:
        # Nothing to look for; previously this raised IndexError on the
        # first line of a non-empty corpus.
        return
    if out is None:
        out = f  # module-level output file opened by the script

    size = len(patron)
    # A bounded deque drops its oldest token automatically on append, and an
    # empty window (rather than placeholder tuples) means filler entries can
    # never be mistaken for a real type such as "---".
    window = deque(maxlen=size)
    with open(corpus_file, encoding='utf-8') as corpus:
        for line in corpus:
            match = _ELEMENT_RE.match(line)
            if match is None:
                # A non-token line breaks the run of consecutive tokens.
                window.clear()
                continue
            window.append((match.group(1), match.group(2)))
            if len(window) < size:
                continue
            if all(tag == gat for (tag, _), gat in zip(window, patron)):
                terme = "".join(
                    f"{forme}/{gat} " for (_, forme), gat in zip(window, patron)
                )
                out.write(f'{terme}\n')

# Run the extraction, then close the output file explicitly so buffered
# matches are flushed to disk even when extraction raises.
try:
    extract(corpus_file, patron)
finally:
    f.close()

