#MODE D'EMPLOI : python3 extract_rel_dep.py ./fichiersUdpipeXML/RUBRIQUE/titre-description-udpipe.xml relation

import re
import sys
import os
from pathlib import Path

# Regex that pulls the section ("rubrique") name out of the input path.
# Raw strings avoid the invalid-escape warnings raised by '\.', '\/' and '\g'.
RUBRIQUE_PATTERN = r'\.\/fichiersUdpipeXML/(.+)?/titre-description-udpipe.xml'

# One <a>...</a> cell of an <item> line; compiled once, used for every token.
ITEM_FIELD_RE = re.compile(r"<a>([^<]+)</a>")


def extract_couples(text, relation):
    """Collect the (head lemma, dependent lemma) pairs linked by *relation*.

    Each line of *text* starting with ``<item>`` is read as one token made of
    ten ``<a>...</a>`` cells: index, word form, lemma, tag, two ignored cells,
    head index, relation label and two more ignored cells.  A line equal to
    ``</p>`` closes the current sentence.

    Args:
        text: Full content of the XML file.
        relation: Dependency relation label to look for (compared exactly).

    Returns:
        A set of ``(head_lemma, dependent_lemma)`` tuples.  Dependents whose
        head index matches no token of the same sentence (e.g. head ``0``)
        are skipped instead of raising ``KeyError``.

    Raises:
        ValueError: If an ``<item>`` line does not hold exactly ten cells.
    """
    couples = set()
    lemma_by_idx = {}   # token index -> lemma, for the sentence being read
    pending = []        # (dependent lemma, head index) awaiting resolution
    for line in text.split("\n"):
        if line.startswith("<item>"):
            fields = ITEM_FIELD_RE.findall(line)
            idx, _word, lemma, _tag, _, _, head, rel, _, _ = fields
            lemma_by_idx[idx] = lemma
            if rel == relation:
                # The head may appear later in the sentence, so resolution is
                # deferred until the closing </p>.
                pending.append((lemma, head))
        if line == "</p>":
            for dep_lemma, head in pending:
                head_lemma = lemma_by_idx.get(head)
                if head_lemma is not None:
                    couples.add((head_lemma, dep_lemma))
            # Token indices restart with each sentence: reset both buffers.
            pending = []
            lemma_by_idx = {}
    return couples


def main(argv=None):
    """Run the extraction from the command line.

    Args:
        argv: Argument list shaped like ``sys.argv`` (program name, input
            file, relation).  Defaults to ``sys.argv``.

    Writes ``PYTHON/<rubrique>/RELATION_<relation>.txt`` with one
    ``head -[relation]-> dependent`` line per distinct pair, and prints the
    path of that file.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        sys.exit(f"usage: python3 {argv[0] if argv else 'extract_rel_dep.py'} "
                 "./fichiersUdpipeXML/RUBRIQUE/titre-description-udpipe.xml relation")

    fic = argv[1]
    relation = argv[2]
    rubrique = re.sub(RUBRIQUE_PATTERN, r'\g<1>', fic)

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(f'PYTHON/{rubrique}', exist_ok=True)
    out_path = f'PYTHON/{rubrique}/RELATION_{relation}.txt'
    print(out_path)

    # Read with an explicit encoding so the result does not depend on the
    # locale; the output is written as UTF-8 as well.
    # NOTE(review): assumes the UDPipe XML input is UTF-8 -- confirm.
    text = Path(fic).read_text(encoding='utf-8')
    couples = extract_couples(text, relation)

    # The context manager guarantees the file is flushed and closed; sorting
    # makes the output reproducible from one run to the next.
    with open(out_path, 'w', encoding='utf-8') as out:
        out.writelines(
            f"{src} -[{relation}]-> {tgt}\n" for src, tgt in sorted(couples)
        )


if __name__ == "__main__":
    main()

