#!/usr/local/python/Python-2.3.4/bin/python
# -*- coding: utf-8 -*-

## external method to import endnote 8/9 xml exports ##

from xml.parsers import expat
from MySQLdb import escape_string

# Tags whose own text content is dropped during import (list of tag names).
# Only the innermost open tag is checked, so the values are simply not stored.
noImport = ['source-app']
# Tags that are not treated as keys of their own (list of tag names). Their
# markup and text are stored as part of the enclosing key's value. If that
# key is listed in cleanVersion, a copy without the ignored markup is also
# stored under '<key>_clean'.
#    e.g. '<title><style ...>Title</style></title>'
#         -> {'title': '<style ...>Title</style>', 'title_clean': 'Title'}
#    ('title_clean' exists only because 'title' is in the list cleanVersion)
ignoreImport = ['style']
# Keys for which a clean version (without ignored markup) is saved in
# '<key>_clean'.
cleanVersion = ['title','abstract','authors','keywords','year']
# Tags whose repeated occurrences are concatenated into one value stored under
# the name of their parent tag (dictionary: key is the tag name, value is the
# separator inserted between occurrences). Grouping only applies when the tag
# is nested at least three levels deep.
groupImport = {'author':'; ','url':'&#xD;', 'date':'; ', 'keyword':'; '}

class ENXMLParser:
    """Parse an EndNote XML export into a list of flat record dictionaries.

    The whole document is parsed inside the constructor. Afterwards
    ``self.records`` holds one dictionary per ``<record>`` element, mapping
    key names (tag names, plus ``<key>_clean`` variants) to the text that was
    accumulated for them.

    Parsing is steered by the module-level settings ``noImport``,
    ``ignoreImport``, ``cleanVersion`` and ``groupImport``.
    """

    def __init__(self, data):
        """Set up the expat parser and parse ``data`` immediately.

        data -- the complete XML document.

        Errors raised by expat while parsing propagate to the caller.
        """
        # stack of currently open tags (ignored tags are never pushed)
        self.tags = []
        # 1 while text should also be appended to the '<key>_clean' value
        self.cleanTag = 0
        # values collected for the record that is currently being parsed
        self.values = {}
        # finished records, in document order
        self.records = []
        # 1 if a group separator is due before the next grouped value
        self.separator = 0

        self.parser = expat.ParserCreate(encoding='UTF-8')
        self.parser.StartElementHandler = self.Start_Element_Handler
        self.parser.EndElementHandler = self.End_Element_Handler
        # Text is received through the default handler, not through a
        # CharacterDataHandler.
        # NOTE(review): presumably chosen so that text arrives exactly as
        # written in the document (see the raw '&#xD;' separator in
        # groupImport) -- confirm.
        self.parser.DefaultHandler = self.Default_Handler

        # NOTE(review): Parse() is called without the 'isfinal' flag, so a
        # truncated document is not reported as an error. Kept as-is because
        # passing the flag would make empty input raise instead of yielding
        # no records.
        self.parser.Parse(data)

    def Start_Element_Handler(self, name, attrs):
        """Handle an opening tag.

        Regular tags are pushed on the tag stack. Tags listed in
        ``ignoreImport`` are re-serialised and appended to the value of the
        current key instead.
        """
        if name not in ignoreImport:
            # we have a new key -> remember it and request a separator
            self.tags.append(name)
            self.separator = 1
            return

        # the current key has content that should be ignored -> start
        # keeping a cleaned version if one is wanted for this key
        if self.getCurrentKey() in cleanVersion:
            self.cleanTag = 1
        attributes = ' '.join([key + '="' + attrs[key] + '"' for key in attrs])
        # the markup itself only goes into the non-clean version
        self.addValue('<' + name + ' ' + attributes + '>', 0)

    def End_Element_Handler(self, name):
        """Handle a closing tag and flush the record at ``</record>``."""
        if name in ignoreImport:
            # closing markup of an ignored tag belongs to the non-clean value
            self.addValue('</' + name + '>', 0)
        else:
            # end of key -> remove it from the stack
            del self.tags[-1]
            self.cleanTag = 0
        if name == 'record':
            # end of a record -> store it and start a fresh one
            self.records.append(self.values)
            self.values = {}

    def Default_Handler(self, data):
        """Handle everything without a dedicated handler (i.e. text)."""
        self.addValue(data, 1)

    def addValue(self, value, clean):
        """Append ``value`` to the entry of the current key.

        value -- text to append.
        clean -- 1 if the text also belongs into the '<key>_clean' entry
                 (only honoured while ``self.cleanTag`` is set), 0 if it
                 must only go into the regular entry.

        Nothing is stored while no tag is open or while the innermost open
        tag is listed in ``noImport``.
        """
        if not self.tags or self.tags[-1] in noImport:
            return

        tag = self.tags[-1]
        # grouped tags are collected under the name of their parent tag
        grouped = len(self.tags) > 2 and tag in groupImport
        if grouped:
            key = self.tags[-2]
        else:
            key = tag
        cleanKey = key + '_clean'

        if key not in self.values:
            # first value for this key -> create the entry
            self.values[key] = ''
            if grouped:
                # it is the first group member, so no separator is wanted
                self.separator = 0
        if self.cleanTag and cleanKey not in self.values:
            # a clean version is wanted -> start it as a copy of the entry
            self.values[cleanKey] = self.values[key]

        if grouped:
            if self.separator:
                self.values[key] += groupImport[tag]
                if self.cleanTag:
                    self.values[cleanKey] += groupImport[tag]
            # no more separators until the next tag begins
            self.separator = 0

        self.values[key] += value
        if self.cleanTag and clean:
            self.values[cleanKey] += value

    def getCurrentKey(self):
        """Return the key values are currently stored under, or None.

        This is the parent tag for grouped tags, otherwise the innermost
        open tag; None if no tag is open.
        """
        if len(self.tags) > 2 and self.tags[-1] in groupImport:
            return self.tags[-2]
        elif self.tags:
            return self.tags[-1]
        else:
            return None


def escape(string):
    """Return ``string`` with backslashes and quotes backslash-escaped.

    A backslash is put in front of every backslash, single quote and double
    quote.
    """
    # Backslashes go first so the ones added for the quotes below are not
    # escaped a second time.
    escaped = string.replace('\\', '\\\\')
    escaped = escaped.replace('\'', '\\\'')
    escaped = escaped.replace('"', '\\"')
    return escaped

def endnoteImport(data):
    """Parse an EndNote XML document into (keys, values) string pairs.

    data -- the XML document, handed unchanged to ENXMLParser.

    Returns a list with one tuple per parsed record:
      * the record's keys, each wrapped in backticks and joined by commas
      * the record's values, each passed through escape(), wrapped in double
        quotes and joined by commas
    Both strings are encoded as UTF-8.
    NOTE(review): the tuples look like the column and value lists of an SQL
    INSERT statement built by the caller -- confirm.
    """
    parsed = ENXMLParser(data)
    queries = []
    for record in parsed.records:
        # keys() and values() of an unmodified dict come in matching order
        columns = '`,`'.join(record.keys())
        contents = '","'.join(map(escape, record.values()))
        kk = ('`' + columns + '`').encode('utf8', 'replace')
        vv = ('"' + contents + '"').encode('utf8', 'replace')
        queries.append((kk, vv))
    return queries


