Skip to content

Commit 86b1f59

Browse files
committed
Make the MD5 hash method take a buffer instead of bytes.
This is to account for files too large to fit into memory.
1 parent e062c89 commit 86b1f59

3 files changed

Lines changed: 49 additions & 10 deletions

File tree

gcloud/storage/_helpers.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -192,12 +192,28 @@ def _setter(self, value):
192192
return property(_getter, _setter)
193193

194194

195-
def _base64_md5hash(bytes_to_sign):
195+
def _write_buffer_to_hash(buffer_object, hash_obj, digest_block_size=8192):
196+
"""Read blocks from a buffer and update a hash with them.
197+
198+
:type buffer_object: bytes buffer
199+
:param buffer_object: Buffer containing bytes used to update a hash object.
200+
"""
201+
block = buffer_object.read(digest_block_size)
202+
203+
while len(block) > 0:
204+
hash_obj.update(block)
205+
# Update the block for the next iteration.
206+
block = buffer_object.read(digest_block_size)
207+
208+
209+
def _base64_md5hash(buffer_object):
196210
"""Get MD5 hash of bytes (as base64).
197211
198-
:type bytes_to_sign: bytes
199-
:param bytes_to_sign: Bytes used to compute an MD5 hash (as base64).
212+
:type buffer_object: bytes buffer
213+
:param buffer_object: Buffer containing bytes used to compute an MD5
214+
hash (as base64).
200215
"""
201-
hash = MD5.new(data=bytes_to_sign)
202-
digest_bytes = hash.digest()
216+
hash_obj = MD5.new()
217+
_write_buffer_to_hash(buffer_object, hash_obj)
218+
digest_bytes = hash_obj.digest()
203219
return base64.b64encode(digest_bytes)

gcloud/storage/test__helpers.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -244,26 +244,44 @@ def _callFUT(self, bytes_to_sign):
244244
return _base64_md5hash(bytes_to_sign)
245245

246246
def test_it(self):
247+
from io import BytesIO
247248
BYTES_TO_SIGN = b'FOO'
248-
SIGNED_CONTENT = self._callFUT(BYTES_TO_SIGN)
249+
BUFFER = BytesIO()
250+
BUFFER.write(BYTES_TO_SIGN)
251+
BUFFER.seek(0)
252+
253+
SIGNED_CONTENT = self._callFUT(BUFFER)
249254
self.assertEqual(SIGNED_CONTENT, b'kBiQqOnIz21aGlQrIp/r/w==')
250255

251256
def test_it_with_stubs(self):
252257
from gcloud._testing import _Monkey
253258
from gcloud.storage import _helpers as MUT
254259

260+
class _Buffer(object):
261+
262+
def __init__(self, return_vals):
263+
self.return_vals = return_vals
264+
self._block_sizes = []
265+
266+
def read(self, block_size):
267+
self._block_sizes.append(block_size)
268+
return self.return_vals.pop()
269+
255270
BASE64 = _Base64()
256271
DIGEST_VAL = object()
257-
BYTES_TO_SIGN = object()
272+
BYTES_TO_SIGN = b'BYTES_TO_SIGN'
273+
BUFFER = _Buffer([b'', BYTES_TO_SIGN])
258274
MD5 = _MD5(DIGEST_VAL)
259275

260276
with _Monkey(MUT, base64=BASE64, MD5=MD5):
261-
SIGNED_CONTENT = self._callFUT(BYTES_TO_SIGN)
277+
SIGNED_CONTENT = self._callFUT(BUFFER)
262278

279+
self.assertEqual(BUFFER._block_sizes, [8192, 8192])
263280
self.assertTrue(SIGNED_CONTENT is DIGEST_VAL)
264281
self.assertEqual(BASE64._called_b64encode, [DIGEST_VAL])
265-
self.assertEqual(MD5._new_called, [BYTES_TO_SIGN])
282+
self.assertEqual(MD5._new_called, [None])
266283
self.assertEqual(MD5.hash_obj.num_digest_calls, 1)
284+
self.assertEqual(MD5.hash_obj._blocks, [BYTES_TO_SIGN])
267285

268286

269287
class _Connection(object):
@@ -283,11 +301,16 @@ class _MD5Hash(object):
283301
def __init__(self, digest_val):
284302
self.digest_val = digest_val
285303
self.num_digest_calls = 0
304+
self._blocks = []
305+
306+
def update(self, block):
307+
self._blocks.append(block)
286308

287309
def digest(self):
288310
self.num_digest_calls += 1
289311
return self.digest_val
290312

313+
291314
class _MD5(object):
292315

293316
def __init__(self, digest_val):

regression/storage.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ def setUpClass(cls):
100100
super(TestStorageFiles, cls).setUpClass()
101101
for file_data in cls.FILES.values():
102102
with open(file_data['path'], 'rb') as file_obj:
103-
file_data['hash'] = _base64_md5hash(file_obj.read())
103+
file_data['hash'] = _base64_md5hash(file_obj)
104104
cls.bucket = SHARED_BUCKETS['test_bucket']
105105

106106
def setUp(self):

0 commit comments

Comments (0)