Module HDFhandler

Source Code for Module HDFhandler

'''
Created on 08/01/2010

@author: Luis
'''

from Utils import utils
from cStringIO import StringIO
from ctypes import create_string_buffer
from os import path

class HDFfile():
    '''
    This class opens an HDF file as a Python file object and uses an XML tag from the map file
    to extract and reconstruct the stored object. The method linearizeDataSpace puts all the
    possible parts of such an object together.
    '''
    def __init__(self, hdf_fileName):
        '''
        Constructor
        '''
        self.utils = utils()
        self.schema = "{http://schemas.hdfgroup.org/hdf4/h4}"  # Etree uses fully qualified namespaces
        self.external_handler = []
        try:  # open the HDF file, creating a binary file object
            file_path = path.normpath(hdf_fileName)
            self.file_handler = file(file_path, "rb")
        except:
            print "HDF file not found: " + hdf_fileName, file_path
            exit(1)

    def linearizeDataSpace(self, node, type):  # offsets
        '''
        This function puts together all the chunks/blocks/cubes of a data structure into a linear buffer.
        The parameter "node" has to be an XML tag with the necessary information about how the object
        is stored in the HDF file.
        '''

        tmp_buffer_object = StringIO()

        if type == "VData":  # this is just for backward compatibility with version 11
            # in the future all types should be handled by the same operations
            for chunk in node.getiterator(self.schema + "byteStream"):  # iterate over the "byteStream" tags
                self.file_handler.seek(int(chunk.attrib["offset"]), 0)
                #if chunk.attrib["compression"] in ("zlib","szip","DEFLATE"):
                tmp_buffer_object.write(self.file_handler.read(int(chunk.attrib["nBytes"])))
            return tmp_buffer_object

        elif type == "SDS":
            dataDimensionSizes = node.find(self.schema + "dataDimensionSizes").text.split(" ")
            DataNode = node.find(self.schema + "datum")
            mapped_type = DataNode.attrib["dataType"]
            try:
                byte_order = DataNode.attrib["byteOrder"]
            except:
                byte_order = "bigEndian"

            py_format, item_size, py_endianness = self.utils.getPythonFormat(mapped_type, byte_order)

            arrayData = node.find(self.schema + "arrayData")
            spatialPart = arrayData.getchildren()
            try:
                compressionType = arrayData.attrib["compressionType"]
                fastestVaryingDimensionIndex = arrayData.attrib["fastestVaryingDimensionIndex"]
            except:
                compressionType = "None"
                fastestVaryingDimensionIndex = "1"

            if spatialPart[0].tag == self.schema + "byteStream" or spatialPart[0].tag == self.schema + "byteStreamSet":
                # Segmented in stream sets.
                # Here the streams are already linearized and ordered,
                # so we do not need n-dimensional processing.
                for stream in spatialPart:
                    if stream.tag == self.schema + "byteStreamSet":
                        unzipped_subChunks = ""
                        for subChunks in stream.getchildren():
                            self.file_handler.seek(int(subChunks.attrib["offset"]), 0)
                            unzipped_subChunks += self.file_handler.read(int(subChunks.attrib["nBytes"]))
                        if compressionType != "None":
                            unzipped_bytes = self.utils.inflate64(unzipped_subChunks)
                        else:
                            unzipped_bytes = unzipped_subChunks

                    elif stream.tag == self.schema + "byteStream":
                        self.file_handler.seek(int(stream.attrib["offset"]), 0)
                        if compressionType != "None":
                            unzipped_bytes = self.utils.inflate64(self.file_handler.read(int(stream.attrib["nBytes"])))
                        else:
                            unzipped_bytes = self.file_handler.read(int(stream.attrib["nBytes"]))
                    tmp_buffer_object.write(unzipped_bytes)
                return tmp_buffer_object

            elif spatialPart[0].tag == self.schema + "chunks":
                # Segmented in chunks.
                # Here we need to put each chunk into its position in the linear buffer.
                chunkDimensionSizes = spatialPart[0].find(self.schema + "chunkDimensionSizes").text.split(" ")
                try:
                    allocatedDimensionSizes = node.find(self.schema + "allocatedDimensionSizes").text.split(" ")
                except:
                    allocatedDimensionSizes = dataDimensionSizes  # same size
                chunkChkSize = 1
                # now we find how many elements we will be reading from n dimensions
                for dim in chunkDimensionSizes:
                    chunkChkSize *= int(dim)

                chunkChkSize *= item_size

                bufferSize = 1
                for dim in allocatedDimensionSizes:
                    bufferSize *= int(dim)

                nDim = len(allocatedDimensionSizes)
                bufferSize *= item_size

                # pre-fill the linear buffer with NUL bytes; .raw yields the whole
                # zero-filled buffer (str() would only return the object's repr)
                tmp = create_string_buffer(bufferSize)
                tmp_buffer_object.write(tmp.raw)
                tmp = None

                for stream in spatialPart[0].getchildren():
                    unzipped_bytes = None
                    if stream.tag == self.schema + "chunkDimensionSizes":
                        continue
                    if stream.tag == self.schema + "byteStreamSet":
                        unzipped_subChunks = ""
                        chunkPos = stream.attrib["chunkPositionInArray"].replace("[", "")
                        chunkPos = chunkPos.replace("]", "").split(",")
                        for subChunks in stream.getchildren():
                            self.file_handler.seek(int(subChunks.attrib["offset"]), 0)
                            unzipped_subChunks += self.file_handler.read(int(subChunks.attrib["nBytes"]))
                        if compressionType != "None":
                            unzipped_bytes = self.utils.inflate64(unzipped_subChunks)
                            if len(unzipped_bytes) != chunkChkSize:
                                print "Error: uncompressed data size does not match the chunk size: ", len(unzipped_bytes), " != ", chunkChkSize
                                return None
                        else:
                            unzipped_bytes = unzipped_subChunks

                    elif stream.tag == self.schema + "byteStream":
                        chunkPos = stream.attrib["chunkPositionInArray"].replace("[", "")
                        chunkPos = chunkPos.replace("]", "").split(",")
                        self.file_handler.seek(int(stream.attrib["offset"]), 0)
                        if compressionType != "None":
                            unzipped_bytes = self.utils.inflate64(self.file_handler.read(int(stream.attrib["nBytes"])))
                            if len(unzipped_bytes) != chunkChkSize:
                                print "Error: uncompressed data size does not match the chunk size: ", len(unzipped_bytes), " != ", chunkChkSize
                                return None
                        else:
                            unzipped_bytes = self.file_handler.read(int(stream.attrib["nBytes"]))

                    # Now we put the chunk into its position in the linear buffer
                    if nDim == 2:
                        if fastestVaryingDimensionIndex == "2":
                            # column-major layout: copy the chunk column by column
                            # (missing a for loop?)
                            base = int(chunkPos[1])
                            chunkOffset = 0
                            for columns in range(base, base + int(chunkDimensionSizes[1])):
                                linearOffset = (columns * int(allocatedDimensionSizes[0])) + int(chunkPos[0])
                                tmp_buffer_object.seek(linearOffset * item_size)
                                col_elements = unzipped_bytes[chunkOffset:chunkOffset + int(chunkDimensionSizes[0]) * item_size]
                                tmp_buffer_object.write(col_elements)
                                chunkOffset += int(chunkDimensionSizes[0]) * item_size
                        else:
                            # row-major layout: copy the chunk row by row
                            base = int(chunkPos[0])
                            chunkOffset = 0
                            for rows in range(base, base + int(chunkDimensionSizes[0])):
                                linearOffset = (rows * int(allocatedDimensionSizes[1])) + int(chunkPos[1])
                                tmp_buffer_object.seek(linearOffset * item_size)
                                # bad: unzipped_bytes should contain only this row
                                row_elements = unzipped_bytes[chunkOffset:chunkOffset + int(chunkDimensionSizes[1]) * item_size]
                                tmp_buffer_object.write(row_elements)
                                chunkOffset += int(chunkDimensionSizes[1]) * item_size

                    elif nDim >= 3:
                        print "Chunked N dimensional SDS are not mapped yet"
                        return None

                return tmp_buffer_object

            elif spatialPart[0].tag == self.schema + "dataInExternalFile":
                for stream in spatialPart[0].getchildren():
                    if stream.tag == self.schema + "byteStreamSet":
                        unzipped_subChunks = ""
                        for subChunks in stream.getchildren():
                            self.file_handler.seek(int(subChunks.attrib["offset"]), 0)
                            unzipped_subChunks += self.file_handler.read(int(subChunks.attrib["nBytes"]))
                        if compressionType != "None":
                            unzipped_bytes = self.utils.inflate64(unzipped_subChunks)
                        else:
                            unzipped_bytes = unzipped_subChunks

                    elif stream.tag == self.schema + "byteStream":
                        self.file_handler.seek(int(stream.attrib["offset"]), 0)
                        if compressionType != "None":
                            unzipped_bytes = self.utils.inflate64(self.file_handler.read(int(stream.attrib["nBytes"])))
                        else:
                            unzipped_bytes = self.file_handler.read(int(stream.attrib["nBytes"]))
                    if len(unzipped_bytes) > 0:
                        tmp_buffer_object.write(unzipped_bytes)
                return tmp_buffer_object

        return tmp_buffer_object
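
A minimal usage sketch of the class (Python 2, like the module itself). The file names, the map-file layout, and the "Array" tag used to locate SDS descriptions below are illustrative assumptions; only HDFfile, linearizeDataSpace, and the namespace string come from the code above.

import xml.etree.ElementTree as ET
from HDFhandler import HDFfile

SCHEMA = "{http://schemas.hdfgroup.org/hdf4/h4}"

hdf = HDFfile("granule.hdf")                          # hypothetical HDF4 file
map_root = ET.parse("granule.hdf.map.xml").getroot()  # hypothetical map file

# Linearize every SDS array described in the map and report its size.
for sds_node in map_root.getiterator(SCHEMA + "Array"):  # element name assumed
    buf = hdf.linearizeDataSpace(sds_node, "SDS")
    if buf is not None:
        print len(buf.getvalue()), "bytes linearized"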
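
The chunked-SDS branch of linearizeDataSpace reduces to row-major offset arithmetic: a chunk whose corner sits at [r0, c0] contributes its k-th row to element offset (r0 + k) * columns + c0 of the full array. A self-contained sketch of just that placement step, with made-up sizes (not taken from any real map file):

from cStringIO import StringIO

allocated = [6, 8]    # full array: 6 rows x 8 columns (made-up example)
chunk_dims = [3, 4]   # one chunk: 3 rows x 4 columns
item_size = 2         # e.g. 16-bit integers
chunk_pos = [3, 4]    # this chunk's corner: row 3, column 4

linear = StringIO()
linear.write("\x00" * (allocated[0] * allocated[1] * item_size))  # zero-filled linear buffer

chunk_bytes = "\xAA" * (chunk_dims[0] * chunk_dims[1] * item_size)  # fake chunk payload

chunk_offset = 0
for row in range(chunk_pos[0], chunk_pos[0] + chunk_dims[0]):
    linear_offset = row * allocated[1] + chunk_pos[1]  # element offset of this chunk row
    linear.seek(linear_offset * item_size)
    linear.write(chunk_bytes[chunk_offset:chunk_offset + chunk_dims[1] * item_size])
    chunk_offset += chunk_dims[1] * item_size

print len(linear.getvalue())  # 96 bytes: 6*8 elements * 2 bytes, chunk placed in rows 3-5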