#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import re
import shutil
from bs4 import BeautifulSoup

# Mirror every RSS/XML file found under SOURCE_DIR into OUTPUT_DIR.
#
# For each ``*.xml`` input, two files are written at the mirrored location:
#   * an ``.xml`` file listing the title/description markup of each kept item,
#   * a ``.txt`` file holding the plain text of those titles/descriptions.
# An item whose <title> markup was already seen (in this file or in any file
# processed earlier in the walk) is treated as a duplicate and skipped.

SOURCE_DIR = 'CORPUS'
OUTPUT_DIR = 'SORTIE'

# Encoding name declared in an XML prolog, e.g. encoding="ISO-8859-1".
# The name follows the XML EncName grammar (letters, digits, '.', '_', '-'),
# and the surrounding quotes are deliberately left out of the capture group.
ENCODING_DECL = re.compile(rb'encoding\s*=\s*["\']?([A-Za-z][A-Za-z0-9._-]*)')


def sniff_encoding(first_line, default='utf-8'):
    """Return the encoding name declared on the first line of an XML file.

    Args:
        first_line: raw bytes of the first line. Bytes are used so that the
            sniffing cannot fail on a decoding error before the real
            encoding is known.
        default: value returned when the line carries no encoding
            declaration (UTF-8 is the XML default).

    Returns:
        The declared encoding name as ``str``, without quotes.
    """
    found = ENCODING_DECL.search(first_line)
    if found is None:
        return default
    return found.group(1).decode('ascii')


def mirror_dir(directory, src_root, dst_root):
    """Map *directory* (located under *src_root*) to its twin under *dst_root*.

    Only the leading root is swapped; path components that merely contain
    the root's name are left untouched.
    """
    relative = os.path.relpath(directory, src_root)
    # normpath collapses the trailing '.' produced when directory == src_root.
    return os.path.normpath(os.path.join(dst_root, relative))


def txt_path_for(xml_path):
    """Return *xml_path* with its extension (and only that) changed to .txt."""
    return os.path.splitext(xml_path)[0] + '.txt'


def render_items(items, seen):
    """Build the XML and plain-text bodies for a sequence of <item> tags.

    Args:
        items: iterable of parsed <item> tags exposing ``.title`` and
            ``.description`` (each a tag or ``None``).
        seen: set of title markup strings already emitted; updated in place
            so duplicates are filtered across files as well.

    Returns:
        A ``(xml_body, txt_body)`` tuple of strings.
    """
    xml_parts = []
    txt_parts = []
    counter = 1  # numbering of the kept items inside the current file

    for item in items:
        title = item.title
        description = item.description

        # Exact-match duplicate check on the title markup.
        if str(title) in seen:
            continue

        # NOTE(review): "<archive = N>" is not well-formed XML. It is kept
        # as is because changing it would alter the output format.
        # The opening tag is always written, so that an item without a
        # <title> no longer yields a dangling </archive>.
        xml_parts.append('<archive = ' + str(counter) + '>')
        if title:
            xml_parts.append(repr(title))
            seen.add(str(title))
        xml_parts.append("\n")

        # An absent description still closes the archive entry.
        if description:
            xml_parts.append(repr(description))
        xml_parts.append("\n</archive>\n")

        # Plain text: .string is None when the tag is empty or has several
        # children, in which case nothing is written for it.
        if title and title.string:
            txt_parts.append(title.string + "\n")
        if description and description.string:
            txt_parts.append(description.string + "\n\n")

        counter += 1

    return ''.join(xml_parts), ''.join(txt_parts)


def convert_file(src_path, xml_out, seen):
    """Parse one XML feed and write its .xml and .txt outputs.

    Args:
        src_path: path of the input XML file.
        xml_out: path of the XML file to write; the text file is written
            next to it with a ``.txt`` extension.
        seen: set of already-emitted title markup strings (see render_items).
    """
    # Read the prolog as bytes to find out how to decode the whole file.
    with open(src_path, 'rb') as raw:
        encoding = sniff_encoding(raw.readline())

    with open(src_path, 'r', encoding=encoding) as handle:
        soup = BeautifulSoup(handle, 'lxml')

    xml_body, txt_body = render_items(soup.find_all('item'), seen)

    with open(xml_out, 'w', encoding='utf-8') as out:
        out.write('<?xml version="1.0" encoding="utf-8"?>\n<file>\n')
        out.write(xml_body)
        out.write('</file>')

    with open(txt_path_for(xml_out), 'w', encoding='utf-8') as out:
        out.write(txt_body)


def main():
    """Rebuild OUTPUT_DIR from scratch as a processed mirror of SOURCE_DIR."""
    # Start from a clean output tree; a missing tree (first run) is fine.
    try:
        shutil.rmtree(OUTPUT_DIR)
    except FileNotFoundError:
        pass

    seen = set()  # title markup already emitted, shared across all files

    for root, _dirs, files in os.walk(SOURCE_DIR):
        # os.walk is top-down, so parent directories are created first.
        out_dir = mirror_dir(root, SOURCE_DIR, OUTPUT_DIR)
        os.makedirs(out_dir, exist_ok=True)

        for name in files:
            if not name.endswith('.xml'):
                continue
            src_path = os.path.join(root, name)
            print(src_path)
            convert_file(src_path, os.path.join(out_dir, name), seen)


# The script runs at module level, as it always has.
main()

