XMLparser

'''
This module recursively parses an XML map file looking for supported XML tags. These tags contain metadata about an HDF
object in an HDF file. If a supported tag is found, the class uses an instance of "HDFfile"; that class
loads the HDF file and returns the object data in a normalized buffer.

This buffer and the XML tag are then passed to the handler class (in this iteration VData and SDS).
The handler classes are in charge of reconstructing the HDF objects and returning them as Python data structures.

'''

def __init__(self,hdf_path,map_file,operation,hdf_object,dump_format):
    '''
    Constructor: store the arguments, parse the XML map and build the helper objects.

    hdf_path:    Prefix prepended to an HDF file name before the file is opened.
    map_file:    Name of the XML map file; it is parsed here with ElementTree.
    operation:   Kept as self.hdf_operation.
    hdf_object:  Kept as self.hdf_object.
    dump_format: Kept as self.dump_format.

    self.tree:        Root element of the parsed XML map file.
    self.group_stack: Stack of group names later used to name the output files.
    '''
    # Arguments coming from the caller.
    self.xml_file = map_file
    self.dump_format = dump_format

    # Traversal state: node level (used to name and format groups) and the
    # fully qualified namespace, because ElementTree reports tags with it.
    self.depth = 0
    self.schema = "{http://schemas.hdfgroup.org/hdf4/h4}"

    # Parse the XML document; the root tag carries the HDF file name.
    root = etree.parse(map_file).getroot()
    self.tree = root
    self.hdf_file_name = root.attrib["name"]

    self.hdf_path = hdf_path
    self.hdf_object = hdf_object
    self.hdf_operation = operation

    # HDFfile knows how to extract the buffer of an object from the HDF file.
    hdf_location = hdf_path + self.hdf_file_name
    self.hdf_handler = HDFfile(hdf_location)

    self.group_stack = []
    self.external_files = {}  # References to external files, keyed by id.

    # VData and SDS rebuild their objects from the linear buffer.
    self.vdata = VData()
    self.SDS = SDS()
    self.vdata_table = []  # Will hold the VData tables.
    self.SDS_table = []

    self.utils = utils()

def parseXML(self):
    '''
    Parse the XML map file by walking self.tree with the recursive method "recursiveWalk".

    The group stack is reset to the single prefix "Root--" before the walk starts, so
    calling this method more than once on the same instance does not pile extra
    "Root--" prefixes (or leftovers from a previous walk) onto the output names.
    '''
    # We maintain a hierarchy to name the extracted objects; "Root--" is the first prefix.
    # Slice assignment empties and refills the existing list in place rather than rebinding it.
    self.group_stack[:] = ["Root--"]
    # The walk starts at the root element with depth 1.
    self.recursiveWalk(self.tree, 1)

def recursiveWalk(self,node,depth):
    '''
    Traverse the children of "node" recursively and handle every recognized tag.

    node:  ElementTree element whose children are visited.
    depth: Level of "node" in the document; it is stored in self.depth and used to
           indent the printed listing.

    Recognized tags (all in the self.schema namespace):

    ExternalFile: its "id" is mapped to "location/filename" in self.external_files.
    Group:        if the operation is "l" the group name is printed; otherwise the
                  prefix "_G-<id>_" is pushed on self.group_stack while the group's
                  children are visited and popped afterwards, so the stack always
                  holds exactly the enclosing groups of the node being processed.
    Table:        (VData) the name is printed; unless the operation is "l" the object
                  is extracted and written with self.utils.
    Array:        (SDS) the name is printed; unless the operation is "l" the object
                  is extracted and written with self.utils.

    Output files are named:
    <xml_file>_dump/<joined group stack><name of "node"> <id of the object>

    A Table without "tableData" or an Array without "arrayData" is reported and skipped.
    '''
    schema = self.schema
    list_only = self.hdf_operation == "l"
    indent = "-" * depth
    self.depth = depth

    # Iterate the element itself: Element.getchildren() was removed in Python 3.9.
    # Single-argument print(...) behaves the same as a statement (Python 2) and as a function (Python 3).
    for child in node:
        tag = child.tag
        entered_group = False

        if tag == schema + "ExternalFile":
            # We store the location and ID of external files in a Python dictionary.
            self.external_files[str(child.attrib["id"])] = str(child.attrib["location"]) + "/" + str(child.attrib["filename"])
            print(self.external_files)

        if tag == schema + "Group":
            if list_only:
                print(indent + "Group: " + child.attrib["name"])
            else:
                # Pushed here and popped once the subtree is done (end of this iteration),
                # so sibling groups never inherit each other's prefix.
                self.group_stack.append('_G-' + child.attrib["id"] + '_')
                entered_group = True

        # VData
        if tag == schema + "Table":
            print(indent + "VData: " + child.attrib["name"])
            if not list_only:
                data_node = child.find(schema + "tableData")
                if data_node is None:
                    # Same handling as an Array without arrayData: report and skip.
                    print("tableData not found")
                else:
                    first = data_node[0] if len(data_node) > 0 else None
                    if first is not None and first.tag == schema + "dataInExternalFile":
                        # If a table is stored in an external file we create a temporary instance of
                        # HDFfile to buffer the object from that file.
                        external_path = self.hdf_path + self.external_files[first.attrib["ref"]]
                        data_buffer = HDFfile(external_path).linearizeDataSpace(first, "VData")
                    else:
                        # The data is in the same HDF file: the tableData node is sent to the
                        # HDFfile instance, which returns the object in a linear buffer.
                        data_buffer = self.hdf_handler.linearizeDataSpace(data_node, "VData")

                    self.vdata_table = self.vdata.Extract(child, data_buffer, self.dump_format)
                    temp_file_name = self.xml_file + "_dump/" + "".join(self.group_stack) + node.attrib["name"] + " " + child.attrib["id"]

                    # Deliberately an equality test: False (or 0) selects CSV output,
                    # any other value, including None, selects the .dat output.
                    if self.dump_format == False:
                        self.utils.createCSVfromTable(self.vdata_table, temp_file_name)
                    else:
                        self.utils.createPlainDatFile(self.vdata_table, temp_file_name)

        # SDS
        elif tag == schema + "Array":
            print(indent + "Array: " + child.attrib["name"])
            if not list_only:
                data_node = child.find(schema + "arrayData")
                data_buffer = None
                if data_node is None:
                    print("arrayData not found")
                else:
                    first = data_node[0] if len(data_node) > 0 else None
                    if first is not None and first.tag == schema + "dataInExternalFile":
                        # Data stored in an external file: a temporary HDFfile instance
                        # buffers the object from that file.
                        print("External data")
                        external_path = self.hdf_path + self.external_files[first.attrib["ref"]]
                        data_buffer = HDFfile(external_path).linearizeDataSpace(child, "SDS")
                    else:
                        # The data is in the same HDF file: the Array node is sent to the
                        # HDFfile instance, which returns the object in a linear buffer.
                        data_buffer = self.hdf_handler.linearizeDataSpace(child, "SDS")

                temp_file_name = self.xml_file + "_dump/" + "".join(self.group_stack) + node.attrib["name"] + " " + child.attrib["id"]
                if data_buffer is not None:
                    # Same equality test as for tables: False (or 0) means CSV, anything else .dat.
                    if self.dump_format == False:
                        self.SDS_table = self.SDS.Extract(child, data_buffer)
                        self.utils.createCSVfromTable(self.SDS_table, temp_file_name)
                    else:
                        self.utils.createPlainDatFile(data_buffer.getvalue(), temp_file_name)

        if len(child) > 0:
            self.recursiveWalk(child, depth + 1)
            # The nested call overwrote self.depth; restore this level's value.
            self.depth = depth

        if entered_group:
            self.group_stack.pop()

Source Code for Module XMLparser