Skip to content

Commit 3b902ed

Browse files
committed
Initial version of a pack design that should be able to solve the problem nicely
streams: added pack specific Info and Stream types, including test
1 parent 0717775 commit 3b902ed

5 files changed

Lines changed: 298 additions & 42 deletions

File tree

fun.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,17 @@
1111

1212

1313
# INVARIANTS
14+
OFS_DELTA = 6
15+
REF_DELTA = 7
1416
type_id_to_type_map = {
17+
0 : "", # EXT 1
1518
1 : "commit",
1619
2 : "tree",
1720
3 : "blob",
18-
4 : "tag"
21+
4 : "tag",
22+
5 : "", # EXT 2
23+
OFS_DELTA : "OFS_DELTA", # OFFSET DELTA
24+
REF_DELTA : "REF_DELTA" # REFERENCE DELTA
1925
}
2026

2127
# used when dealing with larger streams
@@ -42,30 +48,23 @@ def loose_object_header_info(m):
4248
type_name, size = hdr[:hdr.find("\0")].split(" ")
4349
return type_name, int(size)
4450

45-
def object_header_info(m):
46-
""":return: tuple(type_string, uncompressed_size_in_bytes
47-
:param mmap: mapped memory map. It will be
48-
seeked to the actual start of the object contents, which can be used
49-
to initialize a zlib decompress object.
50-
:note: This routine can only handle new-style objects which are assumably contained
51-
in packs
52-
"""
53-
assert not is_loose_object(m), "Use loose_object_header_info instead"
54-
51+
def pack_object_header_info(data):
52+
""":return: tuple(type_id, uncompressed_size_in_bytes, byte_offset)
53+
The type_id should be interpreted according to the ``type_id_to_type_map`` map
54+
The byte-offset specifies the start of the actual zlib compressed datastream
55+
:param data: random-access memory, like a string or memory map"""
5556
c = b0 # first byte
5657
i = 1 # next char to read
5758
type_id = (c >> 4) & 7 # numeric type
5859
size = c & 15 # starting size
5960
s = 4 # starting bit-shift size
6061
while c & 0x80:
61-
c = ord(m[i])
62+
c = ord(data[i])
6263
i += 1
6364
size += (c & 0x7f) << s
6465
s += 7
6566
# END character loop
6667

67-
# finally seek the map to the start of the data stream
68-
m.seek(i)
6968
try:
7069
return (type_id_to_type_map[type_id], size)
7170
except KeyError:

pack.py

Lines changed: 134 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,22 @@
1-
"""Contains PackIndex and PackFile implementations"""
1+
"""Contains PackIndexFile and PackFile implementations"""
22
from util import (
33
LockedFD,
44
LazyMixin,
55
file_contents_ro,
66
unpack_from
77
)
88

9+
from fun import (
10+
pack_object_header_info
11+
)
912
from struct import (
1013
pack,
1114
)
1215

13-
__all__ = ('PackIndex', 'Pack')
16+
__all__ = ('PackIndexFile', 'PackFile')
1417

1518

16-
class PackIndex(LazyMixin):
19+
class PackIndexFile(LazyMixin):
1720
"""A pack index provides offsets into the corresponding pack, allowing to find
1821
locations for offsets faster."""
1922

@@ -26,7 +29,7 @@ class PackIndex(LazyMixin):
2629
_sha_list_offset = 8 + 1024
2730

2831
def __init__(self, indexpath):
29-
super(PackIndex, self).__init__()
32+
super(PackIndexFile, self).__init__()
3033
self._indexpath = indexpath
3134

3235
def _set_cache_(self, attr):
@@ -121,9 +124,9 @@ def _initialize(self):
121124
self._fanout_table = self._read_fanout((self._version == 2) * 8)
122125

123126
if self._version == 2:
124-
self._crc_list_offset = self._sha_list_offset + self.size * 20
125-
self._pack_offset = self._crc_list_offset + self.size * 4
126-
self._pack_64_offset = self._pack_offset + self.size * 4
127+
self._crc_list_offset = self._sha_list_offset + self.size() * 20
128+
self._pack_offset = self._crc_list_offset + self.size() * 4
129+
self._pack_64_offset = self._pack_offset + self.size() * 4
127130
# END setup base
128131

129132
def _read_fanout(self, byte_offset):
@@ -139,21 +142,17 @@ def _read_fanout(self, byte_offset):
139142
#} END initialization
140143

141144
#{ Properties
142-
@property
143145
def version(self):
144146
return self._version
145147

146-
@property
147148
def size(self):
148149
""":return: amount of objects referred to by this index"""
149150
return self._fanout_table[255]
150151

151-
@property
152152
def packfile_checksum(self):
153153
""":return: 20 byte sha representing the sha1 hash of the pack file"""
154154
return self._data[-40:-20]
155155

156-
@property
157156
def indexfile_checksum(self):
158157
""":return: 20 byte sha representing the sha1 hash of this index file"""
159158
return self._data[-20:]
@@ -186,6 +185,128 @@ def sha_to_index(self, sha):
186185
#} END properties
187186

188187

189-
class Pack(LazyMixin):
190-
"""A pack is a file written according to the Version 2 for git packs"""
188+
class PackFile(LazyMixin):
    """A pack is a file written according to the Version 2 for git packs

    As we currently use memory maps, it could be assumed that the maximum size of
    packs therefore is 32 bit on 32 bit systems. On 64 bit systems, this should be
    fine though.

    :note: at some point, this might be implemented using streams as well, or
        streams are an alternate path in the case memory maps cannot be created
        for some reason - one clearly doesn't want to read 10GB at once in that
        case"""

    __slots__ = ('_packpath', '_data', '_size', '_version')

    # Offset into our data at which the first object starts. It equals the size
    # of the header unpacked in _set_cache_ with ">4sLL": a 4 byte signature,
    # a 4 byte version and a 4 byte object count.
    _first_object_offset = 3 * 4

    # Amount of bytes at the end of the data which hold the checksum returned
    # by checksum(); they are not part of any object.
    _trailer_size = 20

    def __init__(self, packpath):
        """Initialize the instance with the path to the pack file.
        Nothing is read until one of the lazy attributes is accessed.

        :param packpath: path to the pack file"""
        super(PackFile, self).__init__()
        self._packpath = packpath

    def _set_cache_(self, attr):
        """Fill in the lazily evaluated attribute named ``attr``.

        ``_data`` is obtained by reading the pack file, any other attribute
        (``_version``, ``_size``) is read from the header of that data.

        :param attr: name of the attribute to set"""
        if attr == '_data':
            ldb = LockedFD(self._packpath)
            fd = ldb.open()
            self._data = file_contents_ro(fd)
            ldb.rollback()
            # TODO: figure out whether we should better keep the lock, or maybe
            # add a .keep file instead ?
        else:
            # read the header information
            type_id, self._version, self._size = unpack_from(">4sLL", self._data, 0)
            assert type_id == "PACK", "Pack file format is invalid: %r" % type_id
            assert self._version in (2, 3), "Cannot handle pack format version %i" % self._version
        # END handle header

    def _iter_objects(self, start_offset, as_stream):
        """Handle the actual iteration of objects within this pack

        :param start_offset: offset at which to start reading object headers.
            A false value starts at the first object
        :param as_stream: currently unused
        :raise NotImplementedError: once the first object header was read, as
            stepping to the next object is not implemented yet"""
        # objects end where the trailing checksum starts
        end_offset = len(self._data) - self._trailer_size
        cur_offset = start_offset or self._first_object_offset

        while cur_offset < end_offset:
            type_id, uncomp_size, data_offset = pack_object_header_info(buffer(self._data, cur_offset))

            # The header only provides the uncompressed size, so the start of
            # the next object is unknown and cur_offset cannot be advanced.
            # Fail loudly instead of looping forever on the same object.
            raise NotImplementedError("Iteration of pack objects is not implemented yet")
        # END until we have read everything

    #{ Interface

    def size(self):
        """:return: The amount of objects stored in this pack"""
        return self._size

    def version(self):
        """:return: the version of this pack"""
        return self._version

    def checksum(self):
        """:return: the last 20 bytes of the pack data, which is the sha1
            checksum stored in the trailer of the pack file"""
        return self._data[-self._trailer_size:]

    #} END interface

    #{ Read-Database like Interface

    def info(self, offset):
        """Retrieve information about the object at the given file-absolute offset

        :param offset: byte offset
        :return: OPackInfo instance, the actual type differs depending on the type_id attribute
        :raise NotImplementedError: always, the method is not implemented yet"""
        raise NotImplementedError()

    def stream(self, offset):
        """Retrieve an object at the given file-relative offset as stream along with its information

        :param offset: byte offset
        :return: OPackStream instance, the actual type differs depending on the type_id attribute
        :raise NotImplementedError: always, the method is not implemented yet"""
        raise NotImplementedError()

    #} END Read-Database like Interface
265+
266+
267+
class PackFileEntity(object):
    """Pairs a pack index file with its pack file.

    Both are created from a common basename; the methods for resolving and
    iterating the contained objects are placeholders for now."""

    __slots__ = ('_index', '_pack')

    # types to instantiate for the index and the pack respectively
    IndexFileCls = PackIndexFile
    PackFileCls = PackFile

    def __init__(self, basename):
        """:param basename: path without extension, '.idx' and '.pack' are
            appended to obtain the paths of the index and the pack"""
        index_path = "%s.idx" % basename
        pack_path = "%s.pack" % basename
        self._index = self.IndexFileCls(index_path)
        self._pack = self.PackFileCls(pack_path)

    def _iter_objects(self, as_stream):
        """Placeholder for the iteration shared by info_iter and stream_iter

        :param as_stream: False when called by info_iter, True when called
            by stream_iter
        :raise NotImplementedError: always"""
        raise NotImplementedError()

    #{ Read-Database like Interface

    def info(self, sha):
        """Meant to provide information about the object with the given sha

        :param sha: 20 byte sha1
        :return: OInfo instance
        :raise NotImplementedError: always, as this is not implemented yet"""
        raise NotImplementedError()

    def stream(self, sha):
        """Meant to provide a stream of the object with the given sha, along
        with its information

        :param sha: 20 byte sha1
        :return: OStream instance
        :raise NotImplementedError: always, as this is not implemented yet"""
        raise NotImplementedError()

    #} END Read-Database like Interface

    #{ Interface

    def info_iter(self):
        """:return: result of the object iteration without streams, supposed
            to yield OInfo instances for all objects in this pack"""
        return self._iter_objects(as_stream=False)

    def stream_iter(self):
        """:return: result of the object iteration with streams, supposed
            to yield OStream instances for all objects in this pack"""
        return self._iter_objects(as_stream=True)

    #} END interface

stream.py

Lines changed: 81 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,11 @@
1111
zlib
1212
)
1313

14-
__all__ = ('OInfo', 'OStream', 'IStream', 'InvalidOInfo', 'InvalidOStream',
14+
from fun import type_id_to_type_map
15+
16+
__all__ = ('OInfo', 'OPackInfo', 'ODeltaPackInfo',
17+
'OStream', 'OPackStream', 'ODeltaPackStream',
18+
'IStream', 'InvalidOInfo', 'InvalidOStream',
1519
'DecompressMemMapReader', 'FDCompressedSha1Writer')
1620

1721

@@ -55,8 +59,42 @@ def type(self):
5559
def size(self):
5660
return self[2]
5761
#} END interface
58-
59-
62+
63+
64+
class OPackInfo(OInfo):
    """Object information as found in a pack.

    As OInfo, but the field at index 1 is the numerical type id, which is
    available as ``type_id``. ``type`` translates it into the type name."""
    __slots__ = ()

    #{ Interface

    @property
    def type_id(self):
        """:return: the numerical type id stored at index 1"""
        return self[1]

    #} END interface

    @property
    def type(self):
        """:return: name of our type id according to ``type_id_to_type_map``
        :raise KeyError: if the type id is not part of the map"""
        return type_id_to_type_map[self.type_id]
79+
80+
81+
class ODeltaPackInfo(OPackInfo):
    """Pack object information extended by delta specific information.

    The additional field is either the 20 byte sha which points to some
    object in the database, or the base_offset, being an offset into the
    pack at which our base can be found"""
    __slots__ = ()

    def __new__(cls, sha, type, size, delta_info):
        """Create the instance as a tuple of the four given values, in the
        order of the arguments"""
        fields = (sha, type, size, delta_info)
        return tuple.__new__(cls, fields)

    #{ Interface

    @property
    def delta_info(self):
        """:return: the delta information stored at index 3"""
        return self[3]

    #} END interface
96+
97+
6098
class OStream(OInfo):
6199
"""Base for object streams retrieved from the database, providing additional
62100
information about the stream.
@@ -76,6 +114,46 @@ def __init__(self, *args, **kwargs):
76114
def read(self, size=-1):
77115
return self[3].read(size)
78116

117+
@property
118+
def stream(self):
119+
return self[3]
120+
#} END stream reader interface
121+
122+
123+
class OPackStream(OPackInfo):
    """Pack object information which additionally carries a stream,
    outputting an undeltified base object"""
    __slots__ = ()

    def __new__(cls, sha, type, size, stream, *args):
        """Create the instance as a tuple of the four named values.
        Additional positional arguments are accepted and ignored, which helps
        with the initialization of subclasses"""
        fields = (sha, type, size, stream)
        return tuple.__new__(cls, fields)

    #{ Stream Reader Interface

    @property
    def stream(self):
        """:return: the stream stored at index 3"""
        return self[3]

    def read(self, size=-1):
        """:return: result of reading ``size`` from our stream
        :param size: passed on to the ``read`` method of the stream"""
        return self.stream.read(size)

    #} END stream reader interface
140+
141+
142+
class ODeltaPackStream(ODeltaPackInfo):
    """Delta pack object information which additionally carries a stream,
    outputting the uncompressed offset delta information"""
    __slots__ = ()

    def __new__(cls, sha, type, size, delta_info, stream):
        """Create the instance as a tuple of the five given values, in the
        order of the arguments"""
        fields = (sha, type, size, delta_info, stream)
        return tuple.__new__(cls, fields)

    #{ Stream Reader Interface

    @property
    def stream(self):
        """:return: the stream stored at index 4"""
        return self[4]

    def read(self, size=-1):
        """:return: result of reading ``size`` from our stream
        :param size: passed on to the ``read`` method of the stream"""
        return self.stream.read(size)
79157
#} END stream reader interface
80158

81159

0 commit comments

Comments
 (0)