#!/usr/bin/env python3
# -*- coding: utf-8 -*-

#script intended for the 2009 and 2010 data: extracts the section (rubrique) from page titles with a regex

import os
import re
import unidecode
from bs4 import BeautifulSoup

def nettoyage(chaine):
	#strip any leftover XML/HTML tags and make sure the string ends with a period
	nvl_chaine = re.sub(r'<.*?>', '', chaine)
	if nvl_chaine and not nvl_chaine.endswith('.'):
		nvl_chaine += '.'
	return nvl_chaine
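#illustrative examples (hypothetical inputs):
#  nettoyage('Grippe <b>A</b>') -> 'Grippe A.'
#  nettoyage('Deja fini.')      -> 'Deja fini.'  (already ends with a period)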
	
os.makedirs('OUTPUT/2009', exist_ok=True)	#also creates OUTPUT/ itself if it is missing
cat_dict = {}	#maps each category to (entry counter, concatenation of titles already written)

for root, dirs, files in os.walk('2009'):	#walk the input files
	for name in files:
		#first filter: only keep .xml files and skip the fil* ones
		if name.endswith('.xml') and not name.startswith('fil'):
			filename = os.path.join(root, name)
			
			with open(filename, 'rb') as fichier:
				soup = BeautifulSoup(fichier, 'lxml-xml')	#parsing xml
				title_cat = soup.find('title')
				
				#automatic section extraction, tailored to the 2009 and 2010 title formats
				if title_cat and title_cat.string:	#second filter: drops useless files such as fil1514829634-v1
					title = title_cat.string
					list_cat = title.split(' - ')
					if len(list_cat) == 1:
						list_cat = title.split(' : ')
					#the section is usually the first chunk, unless the title starts with the site name
					if list_cat[0] != 'Le Monde.fr':
						cat = list_cat[0]
					elif len(list_cat) > 1:
						cat = list_cat[1]
					else:
						continue	#bare 'Le Monde.fr' title: nothing to extract
						
					cat = cat.lower()
					cat = unidecode.unidecode(cat)
					cat = cat.replace(' ', '')
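					#worked example (hypothetical title): 'Le Monde.fr : Économie Durable'
					#splits on ' : ' into ['Le Monde.fr', 'Économie Durable']; the site name
					#is dropped and normalisation yields cat = 'economiedurable'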
					
					path_cat = 'OUTPUT/2009/' + cat
					print(filename + ': ' + cat)
					
					xmlpath = path_cat + '/' + cat + '.xml'
					
					#build the directory tree and the seed file if needed
					if not os.path.exists(path_cat):
						os.mkdir(path_cat)
						with open(xmlpath, 'w', encoding='utf-8') as xmltmp:
							xmltmp.write('<?xml version="1.0" encoding="utf-8"?>\n<file>\n</file>')
					cat_dict.setdefault(cat, (0, ''))	#survives reruns where the folder already exists
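					#note: the seed ends with the 7-character ASCII sentinel '</file>'; each later
					#write seeks back over it, splices in a new <archive> entry, and rewrites the
					#sentinel, so the file stays well-formed after every item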
						
					#build the corpus
					list_item = soup.find_all('item')
					
					for i in list_item:	#extract title and description from each item
						name_title_xml = i.title
						desc_xml = i.description
						if name_title_xml is None:
							continue	#skip malformed items with no <title>

						#duplicate handling: skip an item if its raw <title> was already written
						counter, titles = cat_dict[cat]
						if str(name_title_xml) not in titles:
							#write into the category xml; the empty <description/> special case (cf. 2016 env_sciences) is handled below
							with open(xmlpath, 'r+', encoding='utf-8') as xmltmp:
								name_title = nettoyage(str(name_title_xml.string))
								#seek back over the 7-character ASCII sentinel '</file>' so the
								#new entry is spliced in just before it
								xmltmp.seek(xmltmp.seek(0, 2) - 7)
								xmltmp.write('\n<archive id="' + str(counter) + '">\n<title>' + name_title + '</title>\n')
								counter += 1

								if desc_xml and desc_xml.string:
									name_description = nettoyage(str(desc_xml.string))
									xmltmp.write('<description>' + name_description + '</description>\n</archive>\n</file>')
								else:
									xmltmp.write('\n</archive>\n</file>')

							titles += str(name_title_xml)	#remember the raw title for the duplicate check
								
							cat_dict[cat] = (counter, titles)	#refresh the state stored in the dictionary
						else:
							break	#a duplicate: the rest of this feed is assumed to have been processed already

				
for root, dirs, files in os.walk('OUTPUT/2009'):	#second pass: flatten each category XML into a plain .txt file
	for name in files:
		if name.endswith('.xml'):
			xmlpath = os.path.join(root, name)
			txtpath = xmlpath.replace('.xml', '.txt')
			print(txtpath)
			with open(xmlpath, 'rb') as xmltmp:
				soup = BeautifulSoup(xmltmp, 'lxml-xml')
				arc_list = soup.find_all('archive')
			with open(txtpath, 'w', encoding='utf-8') as txttmp:
				for i in arc_list:
					if i.title:
						txttmp.write(str(i.title.string) + '\n')
					if i.description:
						txttmp.write(str(i.description.string) + '\n\n')
			
os.system('pause')	#Windows-only: keeps the console window open when launched by double-click
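
#usage sketch (assumed layout): put the raw 2009 dumps under ./2009/ next to
#this script and run it with python3; results land in OUTPUT/2009/<category>/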