Skip to content

Commit 8b07235

Browse files
committed
Deprecate Patch.patch and add Patch.text and Patch.data
This is an attempt to finish up the work in PR libgit2#790 originally done by @erikvanzijst. Thanks to him for the initial work. Patch.patch assumes all content to be encoded in UTF-8 and forcefully replaces any non-decodeable sequences. This can lead to corruption for content that either does not conform to any specific encoding altogether, or uses an encoding that is incompatible with, or ambiguous to UTF-8. As discussed in libgit2#790, this change deprecates Patch.patch in favor of Patch.text and adds Patch.data, which returns the unmodified, raw bytes.
1 parent 75d0d80 commit 8b07235

File tree

4 files changed

+119
-31
lines changed

4 files changed

+119
-31
lines changed

src/patch.c

Lines changed: 47 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -169,25 +169,63 @@ Patch_create_from(PyObject *self, PyObject *args, PyObject *kwds)
169169
return wrap_patch(patch, oldblob, newblob);
170170
}
171171

172+
PyDoc_STRVAR(Patch_data__doc__, "The raw bytes of the patch's contents.");
172173

173-
PyDoc_STRVAR(Patch_patch__doc__,
174-
"Patch diff string. Can be None in some cases, such as empty commits.");
174+
PyObject *
175+
Patch_data__get__(Patch *self)
176+
{
177+
git_buf buf = {NULL};
178+
int err;
179+
PyObject *bytes;
180+
181+
assert(self->patch);
182+
err = git_patch_to_buf(&buf, self->patch);
183+
if (err < 0)
184+
return Error_set(err);
185+
186+
bytes = PyBytes_FromStringAndSize(buf.ptr, buf.size);
187+
git_buf_dispose(&buf);
188+
return bytes;
189+
}
190+
191+
PyDoc_STRVAR(Patch_text__doc__,
192+
"Patch diff string. Can be None in some cases, such as empty commits.\n"
193+
"Note that this decodes the content to Unicode assuming UTF-8 encoding. "
194+
"For non-UTF-8 content that can lead be a lossy, non-reversible process. "
195+
"To access the raw, un-decoded patch, use `patch.data`.");
175196

176197
PyObject *
177-
Patch_patch__get__(Patch *self)
198+
Patch_text__get__(Patch *self)
178199
{
179200
git_buf buf = {NULL};
180201
int err;
181-
PyObject *py_patch;
202+
PyObject *text;
182203

183204
assert(self->patch);
184205
err = git_patch_to_buf(&buf, self->patch);
185206
if (err < 0)
186207
return Error_set(err);
187208

188-
py_patch = to_unicode(buf.ptr, NULL, NULL);
209+
text = to_unicode(buf.ptr, NULL, NULL);
189210
git_buf_dispose(&buf);
190-
return py_patch;
211+
return text;
212+
}
213+
214+
PyDoc_STRVAR(Patch_patch__doc__,
215+
"Patch diff string (deprecated -- use Patch.text instead).\n"
216+
"Can be None in some cases, such as empty commits. "
217+
"Note that this decodes the content to Unicode assuming UTF-8 encoding. "
218+
"For non-UTF-8 content that can lead be a lossy, non-reversible process. "
219+
"To access the raw, un-decoded patch, use `patch.data`.");
220+
221+
PyObject *
222+
Patch_patch__get__(Patch *self)
223+
{
224+
PyErr_WarnEx(PyExc_DeprecationWarning,
225+
"`Patch.patch` assumes UTF-8 encoding and can have unexpected results "
226+
"on other encodings. If decoded text is needed, use `Patch.text` "
227+
"instead. Otherwise use `Patch.data`.", 1);
228+
return Patch_text__get__(self);
191229
}
192230

193231
PyDoc_STRVAR(Patch_hunks__doc__, "hunks");
@@ -221,8 +259,10 @@ PyMethodDef Patch_methods[] = {
221259

222260
PyGetSetDef Patch_getsetters[] = {
223261
GETTER(Patch, delta),
224-
GETTER(Patch, patch),
225262
GETTER(Patch, line_stats),
263+
GETTER(Patch, data),
264+
GETTER(Patch, text),
265+
GETTER(Patch, patch),
226266
GETTER(Patch, hunks),
227267
{NULL}
228268
};

test/data/encoding.tar

49 KB
Binary file not shown.

test/test_blob.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -172,19 +172,19 @@ def test_diff_blob_to_buffer(self):
172172
def test_diff_blob_to_buffer_patch_patch(self):
173173
blob = self.repo[BLOB_SHA]
174174
patch = blob.diff_to_buffer("hello world")
175-
assert patch.patch == BLOB_PATCH
175+
assert patch.text == BLOB_PATCH
176176

177177
def test_diff_blob_to_buffer_delete(self):
178178
blob = self.repo[BLOB_SHA]
179179
patch = blob.diff_to_buffer(None)
180-
assert patch.patch == BLOB_PATCH_DELETED
180+
assert patch.text == BLOB_PATCH_DELETED
181181

182182
def test_diff_blob_create(self):
183183
old = self.repo[self.repo.create_blob(BLOB_CONTENT)]
184184
new = self.repo[self.repo.create_blob(BLOB_NEW_CONTENT)]
185185

186186
patch = old.diff(new)
187-
assert patch.patch == BLOB_PATCH_2
187+
assert patch.text == BLOB_PATCH_2
188188

189189
def test_blob_from_repo(self):
190190
blob = self.repo[BLOB_SHA]
@@ -193,4 +193,4 @@ def test_blob_from_repo(self):
193193
blob = self.repo[BLOB_SHA]
194194
patch_two = blob.diff_to_buffer(None)
195195

196-
assert patch_one.patch == patch_two.patch
196+
assert patch_one.text == patch_two.text

test/test_patch.py

Lines changed: 68 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def test_patch_create_from_buffers(self):
9696
new_as_path=BLOB_NEW_PATH,
9797
)
9898

99-
assert patch.patch == BLOB_PATCH
99+
assert patch.text == BLOB_PATCH
100100

101101
def test_patch_create_from_blobs(self):
102102
old_blob = self.repo[BLOB_OLD_SHA]
@@ -109,7 +109,7 @@ def test_patch_create_from_blobs(self):
109109
new_as_path=BLOB_NEW_PATH,
110110
)
111111

112-
assert patch.patch == BLOB_PATCH2
112+
assert patch.text == BLOB_PATCH2
113113

114114
def test_patch_create_from_blob_buffer(self):
115115
old_blob = self.repo[BLOB_OLD_SHA]
@@ -120,7 +120,7 @@ def test_patch_create_from_blob_buffer(self):
120120
new_as_path=BLOB_NEW_PATH,
121121
)
122122

123-
assert patch.patch == BLOB_PATCH
123+
assert patch.text == BLOB_PATCH
124124

125125
def test_patch_create_from_blob_buffer_add(self):
126126
patch = pygit2.Patch.create_from(
@@ -130,7 +130,7 @@ def test_patch_create_from_blob_buffer_add(self):
130130
new_as_path=BLOB_NEW_PATH,
131131
)
132132

133-
assert patch.patch == BLOB_PATCH_ADDED
133+
assert patch.text == BLOB_PATCH_ADDED
134134

135135
def test_patch_create_from_blob_buffer_delete(self):
136136
old_blob = self.repo[BLOB_OLD_SHA]
@@ -142,7 +142,7 @@ def test_patch_create_from_blob_buffer_delete(self):
142142
new_as_path=BLOB_NEW_PATH,
143143
)
144144

145-
assert patch.patch == BLOB_PATCH_DELETED
145+
assert patch.text == BLOB_PATCH_DELETED
146146

147147
def test_patch_create_from_bad_old_type_arg(self):
148148
with pytest.raises(TypeError):
@@ -163,8 +163,8 @@ def test_context_lines(self):
163163
new_as_path=BLOB_NEW_PATH,
164164
)
165165

166-
context_count = (
167-
len([line for line in patch.patch.splitlines() if line.startswith(" ")])
166+
context_count = len(
167+
[line for line in patch.text.splitlines() if line.startswith(" ")]
168168
)
169169

170170
assert context_count != 0
@@ -181,13 +181,12 @@ def test_no_context_lines(self):
181181
context_lines=0,
182182
)
183183

184-
context_count = (
185-
len([line for line in patch.patch.splitlines() if line.startswith(" ")])
184+
context_count = len(
185+
[line for line in patch.text.splitlines() if line.startswith(" ")]
186186
)
187187

188188
assert context_count == 0
189189

190-
191190
def test_patch_create_blob_blobs(self):
192191
old_blob = self.repo[self.repo.create_blob(BLOB_OLD_CONTENT)]
193192
new_blob = self.repo[self.repo.create_blob(BLOB_NEW_CONTENT)]
@@ -199,7 +198,7 @@ def test_patch_create_blob_blobs(self):
199198
new_as_path=BLOB_NEW_PATH,
200199
)
201200

202-
assert patch.patch == BLOB_PATCH
201+
assert patch.text == BLOB_PATCH
203202

204203
def test_patch_create_blob_buffer(self):
205204
blob = self.repo[self.repo.create_blob(BLOB_OLD_CONTENT)]
@@ -210,7 +209,7 @@ def test_patch_create_blob_buffer(self):
210209
new_as_path=BLOB_NEW_PATH,
211210
)
212211

213-
assert patch.patch == BLOB_PATCH
212+
assert patch.text == BLOB_PATCH
214213

215214
def test_patch_create_blob_delete(self):
216215
blob = self.repo[self.repo.create_blob(BLOB_OLD_CONTENT)]
@@ -221,7 +220,7 @@ def test_patch_create_blob_delete(self):
221220
new_as_path=BLOB_NEW_PATH,
222221
)
223222

224-
assert patch.patch == BLOB_PATCH_DELETED
223+
assert patch.text == BLOB_PATCH_DELETED
225224

226225
def test_patch_create_blob_add(self):
227226
blob = self.repo[self.repo.create_blob(BLOB_NEW_CONTENT)]
@@ -232,7 +231,7 @@ def test_patch_create_blob_add(self):
232231
new_as_path=BLOB_NEW_PATH,
233232
)
234233

235-
assert patch.patch == BLOB_PATCH_ADDED
234+
assert patch.text == BLOB_PATCH_ADDED
236235

237236
def test_patch_delete_blob(self):
238237
blob = self.repo[BLOB_OLD_SHA]
@@ -246,24 +245,73 @@ def test_patch_delete_blob(self):
246245
# Make sure that even after deleting the blob the patch still has the
247246
# necessary references to generate its patch
248247
del blob
249-
assert patch.patch == BLOB_PATCH_DELETED
248+
assert patch.text == BLOB_PATCH_DELETED
250249

251250
def test_patch_multi_blob(self):
252251
blob = self.repo[BLOB_OLD_SHA]
253252
patch = pygit2.Patch.create_from(
254253
blob,
255254
None
256255
)
257-
patch_text = patch.patch
256+
patch_text = patch.text
258257

259258
blob = self.repo[BLOB_OLD_SHA]
260259
patch2 = pygit2.Patch.create_from(
261260
blob,
262261
None
263262
)
264-
patch_text2 = patch.patch
263+
patch_text2 = patch.text
265264

266265
assert patch_text == patch_text2
267-
assert patch_text == patch.patch
268-
assert patch_text2 == patch2.patch
269-
assert patch.patch == patch2.patch
266+
assert patch_text == patch.text
267+
assert patch_text2 == patch2.text
268+
assert patch.text == patch2.text
269+
270+
271+
class PatchEncodingTest(utils.AutoRepoTestCase):
272+
repo_spec = 'tar', 'encoding'
273+
expected_diff = b"""diff --git a/iso-8859-1.txt b/iso-8859-1.txt
274+
index e84e339..201e0c9 100644
275+
--- a/iso-8859-1.txt
276+
+++ b/iso-8859-1.txt
277+
@@ -1 +1,2 @@
278+
Kristian H\xf8gsberg
279+
+foo
280+
"""
281+
282+
def test_patch_from_non_utf8(self):
283+
# blobs encoded in ISO-8859-1
284+
old_content = b'Kristian H\xf8gsberg\n'
285+
new_content = old_content + b'foo\n'
286+
patch = pygit2.Patch.create_from(
287+
old_content,
288+
new_content,
289+
old_as_path='iso-8859-1.txt',
290+
new_as_path='iso-8859-1.txt',
291+
)
292+
293+
self.assertEqual(patch.data, self.expected_diff)
294+
295+
self.assertEqual(
296+
patch.text, self.expected_diff.decode('utf-8', errors='replace'))
297+
298+
# `patch.text` corrupted the ISO-8859-1 content as it forced UTF-8
299+
# decoding, so assert that we cannot get the original content back:
300+
self.assertNotEqual(patch.text.encode('utf-8'), self.expected_diff)
301+
302+
def test_patch_create_from_blobs(self):
303+
patch = pygit2.Patch.create_from(
304+
self.repo['e84e339ac7fcc823106efa65a6972d7a20016c85'],
305+
self.repo['201e0c908e3d9f526659df3e556c3d06384ef0df'],
306+
old_as_path='iso-8859-1.txt',
307+
new_as_path='iso-8859-1.txt',
308+
)
309+
310+
self.assertEqual(patch.data, self.expected_diff)
311+
312+
self.assertEqual(
313+
patch.text, self.expected_diff.decode('utf-8', errors='replace'))
314+
315+
# `patch.text` corrupted the ISO-8859-1 content as it forced UTF-8
316+
# decoding, so assert that we cannot get the original content back:
317+
self.assertNotEqual(patch.text.encode('utf-8'), self.expected_diff)

0 commit comments

Comments
 (0)