gitpython-developers
diff --git a/fun.py b/fun.py
Lines changed: 13 additions & 14 deletions
diff --git a/pack.py b/pack.py
Lines changed: 134 additions & 13 deletions
diff --git a/stream.py b/stream.py
Lines changed: 81 additions & 3 deletions
@@ -11,11 +11,17 @@
 
 
 # INVARIANTS
+OFS_DELTA = 6
+REF_DELTA = 7
 type_id_to_type_map = 	{
+							0 : "",				# EXT 1
 							1 : "commit",
 							2 : "tree",
 							3 : "blob",
-							4 : "tag"
+							4 : "tag",
+							5 : "",				# EXT 2
+							OFS_DELTA : "OFS_DELTA", 	# OFFSET DELTA
+							REF_DELTA : "REF_DELTA"		# REFERENCE DELTA
 						}
 
 # used when dealing with larger streams
@@ -42,30 +48,23 @@ def loose_object_header_info(m):
 	type_name, size = hdr[:hdr.find("\0")].split(" ")
 	return type_name, int(size)
 
-def object_header_info(m):
-	""":return: tuple(type_string, uncompressed_size_in_bytes 
-	:param mmap: mapped memory map. It will be 
-		seeked to the actual start of the object contents, which can be used
-		to initialize a zlib decompress object.
-	:note: This routine can only handle new-style objects which are assumably contained
-		in packs
-		"""
-	assert not is_loose_object(m), "Use loose_object_header_info instead"
-	
+def pack_object_header_info(data):
+	""":return: tuple(type_id, uncompressed_size_in_bytes, byte_offset)
+	The type_id should be interpreted according to the ``type_id_to_type_map`` map
+	The byte-offset specifies the start of the actual zlib compressed datastream
+	:param data: random-access memory, like a string or memory map"""
 	c = b0							# first byte
 	i = 1							# next char to read
 	type_id = (c >> 4) & 7			# numeric type
 	size = c & 15					# starting size
 	s = 4							# starting bit-shift size
 	while c & 0x80:
-		c = ord(m[i])
+		c = ord(data[i])
 		i += 1
 		size += (c & 0x7f) << s
 		s += 7
 	# END character loop
 
-	# finally seek the map to the start of the data stream
-	m.seek(i)
 	try:
 		return (type_id_to_type_map[type_id], size)
 	except KeyError:
 
@@ -1,19 +1,22 @@
-"""Contains PackIndex and PackFile implementations"""
+"""Contains PackIndexFile and PackFile implementations"""
 from util import (
 					LockedFD,
 					LazyMixin,
 					file_contents_ro, 
 					unpack_from
 					)
 
+from fun import (
+					pack_object_header_info
+				)
 from struct import (
 						pack,
 					)
 
-__all__ = ('PackIndex', 'Pack')
+__all__ = ('PackIndexFile', 'PackFile')
 
 
-class PackIndex(LazyMixin):
+class PackIndexFile(LazyMixin):
 	"""A pack index provides offsets into the corresponding pack, allowing to find
 	locations for offsets faster."""
 
@@ -26,7 +29,7 @@ class PackIndex(LazyMixin):
 	_sha_list_offset = 8 + 1024
 
 	def __init__(self, indexpath):
-		super(PackIndex, self).__init__()
+		super(PackIndexFile, self).__init__()
 		self._indexpath = indexpath
 
 	def _set_cache_(self, attr):
@@ -121,9 +124,9 @@ def _initialize(self):
 		self._fanout_table = self._read_fanout((self._version == 2) * 8)
 
 		if self._version == 2:
-			self._crc_list_offset = self._sha_list_offset + self.size * 20
-			self._pack_offset = self._crc_list_offset + self.size * 4
-			self._pack_64_offset = self._pack_offset + self.size * 4
+			self._crc_list_offset = self._sha_list_offset + self.size() * 20
+			self._pack_offset = self._crc_list_offset + self.size() * 4
+			self._pack_64_offset = self._pack_offset + self.size() * 4
 		# END setup base
 
 	def _read_fanout(self, byte_offset):
@@ -139,21 +142,17 @@ def _read_fanout(self, byte_offset):
 	#} END initialization
 
 	#{ Properties
-	@property
 	def version(self):
 		return self._version
 
-	@property
 	def size(self):
 		""":return: amount of objects referred to by this index"""
 		return self._fanout_table[255]
 
-	@property
 	def packfile_checksum(self):
 		""":return: 20 byte sha representing the sha1 hash of the pack file"""
 		return self._data[-40:-20]
 
-	@property
 	def indexfile_checksum(self):
 		""":return: 20 byte sha representing the sha1 hash of this index file"""
 		return self._data[-20:]
@@ -186,6 +185,128 @@ def sha_to_index(self, sha):
 	#} END properties
 
 
-class Pack(LazyMixin):
-	"""A pack is a file written according to the Version 2 for git packs"""
+class PackFile(LazyMixin):
+	"""A pack is a file written according to the Version 2 for git packs
 	
+	As we currently use memory maps, it can be assumed that the maximum size of
+	packs is therefore limited to 32 bit offsets on 32 bit systems. On 64 bit
+	systems, this should be fine though.
+	
+	:note: at some point, this might be implemented using streams as well, or 
+		streams are an alternate path in the case memory maps cannot be created
+		for some reason - one clearly doesn't want to read 10GB at once in that 
+		case"""
+	
+	__slots__ = ('_packpath', '_data', '_size', '_version')
+	
+	# offset into our data at which the first object starts
+	_first_object_offset = 3*4 + 8
+	
+	def __init__(self, packpath):
+		self._packpath = packpath
+		
+	def _set_cache_(self, attr):
+		if attr == '_data':
+			ldb = LockedFD(self._packpath)
+			fd = ldb.open()
+			self._data = file_contents_ro(fd)
+			ldb.rollback()
+			# TODO: figure out whether we should better keep the lock, or maybe
+			# add a .keep file instead ?
+		else:
+			# read the header information
+			type_id, self._version, self._size = unpack_from(">4sLL", self._data, 0)
+			assert type_id == "PACK", "Pack file format is invalid: %r" % type_id
+			assert self._version in (2, 3), "Cannot handle pack format version %i" % self._version
+		# END handle header
+		
+	def _iter_objects(self, start_offset, as_stream):
+		"""Handle the actual iteration of objects within this pack"""
+		size = len(self._data)
+		cur_offset = start_offset or self._first_object_offset
+		
+		while cur_offset < size:
+			type_id, uncomp_size, data_offset = pack_object_header_info(buffer(self._data, cur_offset))
+			
+			# if type_id 
+		# END until we have read everything
+		
+	#{ Interface
+	
+	def size(self):
+		""":return: The amount of objects stored in this pack""" 
+		return self._size
+		
+	def version(self):
+		""":return: the version of this pack"""
+		return self._version
+		
+	def checksum(self):
+		""":return: 20 byte sha1 hash on all object sha's contained in this file"""
+		return self._data[-20:]
+		
+	#} END interface
+	
+	#{ Read-Database like Interface
+	
+	def info(self, offset):
+		"""Retrieve information about the object at the given file-absolute offset
+		:param offset: byte offset
+		:return: OPackInfo instance, the actual type differs depending on the type_id attribute"""
+		raise NotImplementedError()
+		
+	def stream(self, offset):
+		"""Retrieve an object at the given file-relative offset as stream along with its information
+		:param offset: byte offset
+		:return: OPackStream instance, the actual type differs depending on the type_id attribute"""
+		raise NotImplementedError()
+		
+	#} END Read-Database like Interface
+	
+	
+class PackFileEntity(object):
+	"""Combines the PackIndexFile and the PackFile into one, allowing the 
+	actual objects to be resolved and iterated"""
+	
+	__slots__ = ('_index', '_pack')
+	
+	IndexFileCls = PackIndexFile
+	PackFileCls = PackFile
+	
+	def __init__(self, basename):
+		self._index = self.IndexFileCls("%s.idx" % basename)			# PackIndexFile instance
+		self._pack = self.PackFileCls("%s.pack" % basename)			# corresponding PackFile instance
+	
+	
+	def _iter_objects(self, as_stream):
+		raise NotImplementedError
+	
+	#{ Read-Database like Interface
+	
+	def info(self, sha):
+		"""Retrieve information about the object identified by the given sha
+		:param sha: 20 byte sha1
+		:return: OInfo instance"""
+		raise NotImplementedError()
+		
+	def stream(self, sha):
+		"""Retrieve an object stream along with its information as identified by the given sha
+		:param sha: 20 byte sha1
+		:return: OStream instance"""
+		raise NotImplementedError()
+		
+	#} END Read-Database like Interface
+	
+	#{ Interface 
+	
+	def info_iter(self):
+		""":return: Iterator over all objects in this pack. The iterator yields
+			OInfo instances"""
+		return self._iter_objects(as_stream=False)
+		
+	def stream_iter(self):
+		""":return: iterator over all objects in this pack. The iterator yields
+		OStream instances"""
+		return self._iter_objects(as_stream=True)
+		
+	#} Interface
@@ -11,7 +11,11 @@
 		zlib
 	)
 
-__all__ = ('OInfo', 'OStream', 'IStream', 'InvalidOInfo', 'InvalidOStream', 
+from fun import type_id_to_type_map 
+
+__all__ = ('OInfo', 'OPackInfo', 'ODeltaPackInfo', 
+			'OStream', 'OPackStream', 'ODeltaPackStream',
+			'IStream', 'InvalidOInfo', 'InvalidOStream', 
 			'DecompressMemMapReader', 'FDCompressedSha1Writer')
 
 
@@ -55,8 +59,42 @@ def type(self):
 	def size(self):
 		return self[2]
 	#} END interface
-
-
+	
+	
+class OPackInfo(OInfo):
+	"""As OInfo, but provides a type_id property to retrieve the numerical type id"""
+	__slots__ = tuple()
+	
+	@property
+	def type(self):
+		return type_id_to_type_map[self[1]]
+	
+	#{ Interface 
+	
+	@property
+	def type_id(self):
+		return self[1]
+		
+	#} interface
+		
+		
+class ODeltaPackInfo(OPackInfo):
+	"""Adds delta specific information, 
+	Either the 20 byte sha which points to some object in the database, 
+	or the base_offset, being an offset into the pack at which our base 
+	can be found"""
+	__slots__ = tuple()
+	
+	def __new__(cls, sha, type, size, delta_info):
+		return tuple.__new__(cls, (sha, type, size, delta_info))
+		
+	#{ Interface 
+	@property
+	def delta_info(self):
+		return self[3]
+	#} END interface 
+	
+	
 class OStream(OInfo):
 	"""Base for object streams retrieved from the database, providing additional 
 	information about the stream.
@@ -76,6 +114,46 @@ def __init__(self, *args, **kwargs):
 	def read(self, size=-1):
 		return self[3].read(size)
 
+	@property
+	def stream(self):
+		return self[3]
+	#} END stream reader interface
+	
+	
+class OPackStream(OPackInfo):
+	"""Next to pack object information, a stream outputting an undeltified base object
+	is provided"""
+	__slots__ = tuple()
+	
+	def __new__(cls, sha, type, size, stream, *args):
+		"""Helps with the initialization of subclasses"""
+		return tuple.__new__(cls, (sha, type, size, stream))
+		
+	#{ Stream Reader Interface 
+	def read(self, size=-1):
+		return self[3].read(size)
+		
+	@property
+	def stream(self):
+		return self[3]
+	#} END stream reader interface
+
+	
+class ODeltaPackStream(ODeltaPackInfo):
+	"""Provides a stream outputting the uncompressed offset delta information"""
+	__slots__ = tuple()
+	
+	def __new__(cls, sha, type, size, delta_info, stream):
+		return tuple.__new__(cls, (sha, type, size, delta_info, stream))
+
+
+	#{ Stream Reader Interface 
+	def read(self, size=-1):
+		return self[4].read(size)
+		
+	@property
+	def stream(self):
+		return self[4]
 	#} END stream reader interface