This repository was archived by the owner on Mar 31, 2026. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 174
Expand file tree
/
Copy pathtransfer_manager.py
More file actions
1471 lines (1201 loc) · 58.8 KB
/
transfer_manager.py
File metadata and controls
1471 lines (1201 loc) · 58.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Concurrent media operations."""
import concurrent.futures
import io
import inspect
import os
import warnings
import pickle
import copyreg
import struct
import base64
import functools
from pathlib import Path
from google.api_core import exceptions
from google.cloud.storage import Client
from google.cloud.storage import Blob
from google.cloud.storage.blob import _get_host_name
from google.cloud.storage.blob import _quote
from google.cloud.storage.constants import _DEFAULT_TIMEOUT
from google.cloud.storage.retry import DEFAULT_RETRY
import google_crc32c
from google.cloud.storage._media.requests.upload import XMLMPUContainer
from google.cloud.storage._media.requests.upload import XMLMPUPart
from google.cloud.storage.exceptions import DataCorruption, InvalidPathError
# 32 MiB, expressed in bytes.
# NOTE(review): presumably the default chunk size for the chunked transfer
# helpers defined later in this module - confirm against their signatures.
TM_DEFAULT_CHUNK_SIZE = 32 * 1024 * 1024
# Default value of the `max_workers` parameter of the public transfer functions
# in this module.
DEFAULT_MAX_WORKERS = 8
# 4 MiB, expressed in bytes.
# NOTE(review): the name suggests an upper bound on a zero-filled buffer used
# for crc32c computation - confirm at the point of use.
MAX_CRC32C_ZERO_ARRAY_SIZE = 4 * 1024 * 1024
# Maps camelCase metadata field names to the header names that carry them.
# NOTE(review): presumably consumed when building XML multipart upload
# requests - confirm at the point of use.
METADATA_HEADER_TRANSLATION = {
    "cacheControl": "Cache-Control",
    "contentDisposition": "Content-Disposition",
    "contentEncoding": "Content-Encoding",
    "contentLanguage": "Content-Language",
    "customTime": "x-goog-custom-time",
    "storageClass": "x-goog-storage-class",
}
# Constants to be passed in as `worker_type`.
# PROCESS is the default `worker_type` of the public transfer functions; THREAD
# is what the deprecated `threads` parameter is translated into.
PROCESS = "process"
THREAD = "thread"
# Error-message template with three positional `{}` placeholders, in order:
# what was being downloaded, the crc32c recorded in the object metadata, and
# the crc32c computed from the downloaded contents.
DOWNLOAD_CRC32C_MISMATCH_TEMPLATE = """\
Checksum mismatch while downloading:
{}
The object metadata indicated a crc32c checksum of:
{}
but the actual crc32c checksum of the downloaded contents was:
{}
"""
# Module-level dict, empty at import time.
# NOTE(review): presumably caches Client objects recreated inside worker
# processes so each process builds a given client only once - confirm.
_cached_clients = {}
def _deprecate_threads_param(func):
    """Decorate ``func`` so that the deprecated ``threads`` argument still works.

    When the wrapped function is called with a truthy ``threads`` argument
    (positionally or by keyword), the call is rewritten so that
    ``worker_type`` is set to ``THREAD`` and ``max_workers`` is set to the
    value of ``threads``, and a warning is emitted. The ``threads`` value
    itself is still forwarded unchanged.

    :type func: callable
    :param func:
        The function to wrap. Its signature is inspected to locate the
        ``threads``, ``worker_type`` and ``max_workers`` arguments.

    :raises: :exc:`ValueError` at call time if a truthy ``threads`` is
        combined with an explicitly passed, truthy ``worker_type`` or
        ``max_workers``.

    :rtype: callable
    :returns: The wrapping function, carrying the metadata of ``func``.
    """
    # The signature is fixed once the function is defined, so inspect it a
    # single time here rather than on every call.
    signature = inspect.signature(func)

    @functools.wraps(func)
    def convert_threads_or_raise(*args, **kwargs):
        # `binding.arguments` only contains arguments the caller actually
        # supplied (defaults are not applied), so `.get()` yields None for
        # anything that was left out.
        binding = signature.bind(*args, **kwargs)
        threads = binding.arguments.get("threads")
        if not threads:
            return func(*args, **kwargs)

        worker_type = binding.arguments.get("worker_type")
        max_workers = binding.arguments.get("max_workers")
        if worker_type or max_workers:  # Parameter conflict
            raise ValueError(
                "The `threads` parameter is deprecated and conflicts with its replacement parameters, `worker_type` and `max_workers`."
            )

        # No conflict, so issue a warning and set worker_type and max_workers.
        # stacklevel=2 attributes the warning to the code that called the
        # decorated function instead of to this wrapper.
        warnings.warn(
            "The `threads` parameter is deprecated. Please use `worker_type` and `max_workers` parameters instead.",
            stacklevel=2,
        )
        args = binding.args
        kwargs = binding.kwargs
        kwargs["worker_type"] = THREAD
        kwargs["max_workers"] = threads
        return func(*args, **kwargs)

    return convert_threads_or_raise
@_deprecate_threads_param
def upload_many(
    file_blob_pairs,
    skip_if_exists=False,
    upload_kwargs=None,
    threads=None,
    deadline=None,
    raise_exception=False,
    worker_type=PROCESS,
    max_workers=DEFAULT_MAX_WORKERS,
):
    """Upload many files concurrently via a worker pool.

    :type file_blob_pairs: List(Tuple(IOBase or str, 'google.cloud.storage.blob.Blob'))
    :param file_blob_pairs:
        A list of tuples of a file or filename and a blob. Each file will be
        uploaded to the corresponding blob by using APIs identical to
        `blob.upload_from_file()` or `blob.upload_from_filename()` as
        appropriate.

        File handlers are only supported if worker_type is set to THREAD.
        If worker_type is set to PROCESS, please use filenames only.

    :type skip_if_exists: bool
    :param skip_if_exists:
        If True, blobs that already have a live version will not be overwritten.
        This is accomplished by setting `if_generation_match = 0` on uploads.
        Uploads so skipped will result in a 412 Precondition Failed response
        code, which will be included in the return value but not raised
        as an exception regardless of the value of raise_exception.

    :type upload_kwargs: dict
    :param upload_kwargs:
        A dictionary of keyword arguments to pass to the upload method. Refer
        to the documentation for `blob.upload_from_file()` or
        `blob.upload_from_filename()` for more information. The dict is directly
        passed into the upload methods and is not validated by this function.
        The dict passed in is copied and is never modified by this function.

    :type threads: int
    :param threads:
        ***DEPRECATED*** Sets `worker_type` to THREAD and `max_workers` to the
        number specified. If `worker_type` or `max_workers` are set explicitly,
        this parameter should be set to None. Please use `worker_type` and
        `max_workers` instead of this parameter.

    :type deadline: int
    :param deadline:
        The number of seconds to wait for all threads to resolve. If the
        deadline is reached, all threads will be terminated regardless of their
        progress and `concurrent.futures.TimeoutError` will be raised. This can
        be left as the default of `None` (no deadline) for most use cases.

    :type raise_exception: bool
    :param raise_exception:
        If True, instead of adding exceptions to the list of return values,
        instead they will be raised. Note that encountering an exception on one
        operation will not prevent other operations from starting. Exceptions
        are only processed and potentially raised after all operations are
        complete in success or failure.

        If skip_if_exists is True, 412 Precondition Failed responses are
        considered part of normal operation and are not raised as an exception.

    :type worker_type: str
    :param worker_type:
        The worker type to use; one of `google.cloud.storage.transfer_manager.PROCESS`
        or `google.cloud.storage.transfer_manager.THREAD`.

        Although the exact performance impact depends on the use case, in most
        situations the PROCESS worker type will use more system resources (both
        memory and CPU) and result in faster operations than THREAD workers.

        Because the subprocesses of the PROCESS worker type can't access memory
        from the main process, Client objects have to be serialized and then
        recreated in each subprocess. The serialization of the Client object
        for use in subprocesses is an approximation and may not capture every
        detail of the Client object, especially if the Client was modified after
        its initial creation or if `Client._http` was modified in any way.

        THREAD worker types are observed to be relatively efficient for
        operations with many small files, but not for operations with large
        files. PROCESS workers are recommended for large file operations.

        PROCESS workers do not support writing to file handlers. Please refer
        to files by filename only when using PROCESS workers.

    :type max_workers: int
    :param max_workers:
        The maximum number of workers to create to handle the workload.

        With PROCESS workers, a larger number of workers will consume more
        system resources (memory and CPU) at once.

        How many workers is optimal depends heavily on the specific use case,
        and the default is a conservative number that should work okay in most
        cases without consuming excessive resources.

    :raises: :exc:`concurrent.futures.TimeoutError` if deadline is exceeded.
        :exc:`ValueError` if a non-string file object is supplied while the
        selected worker type requires pickling.

    :rtype: list
    :returns: A list of results corresponding to, in order, each item in the
        input list. If an exception was received, it will be the result
        for that operation. Otherwise, the return value from the successful
        upload method is used (which will be None).
    """
    # Always work on a private copy: keys are injected below and the caller's
    # dict must not be modified as a side effect of this call.
    upload_kwargs = dict(upload_kwargs) if upload_kwargs is not None else {}

    if skip_if_exists:
        upload_kwargs["if_generation_match"] = 0

    upload_kwargs["command"] = "tm.upload_many"

    pool_class, needs_pickling = _get_pool_class_and_requirements(worker_type)

    with pool_class(max_workers=max_workers) as executor:
        futures = []
        for path_or_file, blob in file_blob_pairs:
            # File objects are only supported by the THREAD worker because they can't
            # be pickled.
            if needs_pickling and not isinstance(path_or_file, str):
                raise ValueError(
                    "Passing in a file object is only supported by the THREAD worker type. Please either select THREAD workers, or pass in filenames only."
                )

            futures.append(
                executor.submit(
                    _call_method_on_maybe_pickled_blob,
                    _pickle_client(blob) if needs_pickling else blob,
                    (
                        "_handle_filename_and_upload"
                        if isinstance(path_or_file, str)
                        else "_prep_and_do_upload"
                    ),
                    path_or_file,
                    **upload_kwargs,
                )
            )
        # NOTE(review): `concurrent.futures.wait` returns on timeout rather
        # than raising; confirm where the TimeoutError documented for
        # `deadline` is expected to originate.
        concurrent.futures.wait(
            futures, timeout=deadline, return_when=concurrent.futures.ALL_COMPLETED
        )

    results = []
    for future in futures:
        exp = future.exception()

        # If raise_exception is False, don't call future.result()
        if exp and not raise_exception:
            results.append(exp)
        # If skip_if_exists and the exception is PreconditionFailed, do same.
        elif exp and skip_if_exists and isinstance(exp, exceptions.PreconditionFailed):
            results.append(exp)
        # Get the real result. If there was an exception not handled above,
        # this will raise it.
        else:
            results.append(future.result())
    return results
def _resolve_path(target_dir, blob_path):
    """Return the resolved local path for ``blob_path`` placed under ``target_dir``.

    :type target_dir: str
    :param target_dir: Directory the blob path is joined onto.

    :type blob_path: str
    :param blob_path: Blob name, treated as a (possibly anchored) path.

    :raises: :exc:`InvalidPathError` when running on Windows (``os.name`` is
        ``"nt"``) and ``blob_path`` contains a colon.

    :rtype: :class:`pathlib.Path`
    :returns: ``target_dir`` joined with the anchor-stripped ``blob_path``,
        passed through :meth:`pathlib.Path.resolve`.
    """
    on_windows = os.name == "nt"
    if on_windows and ":" in blob_path:
        raise InvalidPathError(f"{blob_path} cannot be downloaded into {target_dir}")

    base = Path(target_dir)
    blob = Path(blob_path)
    # The anchor is "/" for an absolute blob path and empty otherwise. Removing
    # it lets an absolute name nest under the base directory, e.g.
    # /local/target + /usr/local/mybin -> /local/target/usr/local/mybin.
    anchorless = blob.relative_to(blob.anchor)
    return base.joinpath(anchorless).resolve()
@_deprecate_threads_param
def download_many(
    blob_file_pairs,
    download_kwargs=None,
    threads=None,
    deadline=None,
    raise_exception=False,
    worker_type=PROCESS,
    max_workers=DEFAULT_MAX_WORKERS,
    *,
    skip_if_exists=False,
):
    """Download many blobs concurrently via a worker pool.

    :type blob_file_pairs: List(Tuple('google.cloud.storage.blob.Blob', IOBase or str))
    :param blob_file_pairs:
        A list of tuples of blob and a file or filename. Each blob will be
        downloaded to the corresponding file by using APIs identical to
        `blob.download_to_file()` or `blob.download_to_filename()` as
        appropriate.

        Note that `blob.download_to_filename()` does not delete the destination
        file if the download fails.

        File handlers are only supported if worker_type is set to THREAD.
        If worker_type is set to PROCESS, please use filenames only.

    :type download_kwargs: dict
    :param download_kwargs:
        A dictionary of keyword arguments to pass to the download method. Refer
        to the documentation for `blob.download_to_file()` or
        `blob.download_to_filename()` for more information. The dict is directly
        passed into the download methods and is not validated by this function.
        The dict passed in is copied and is never modified by this function.

    :type threads: int
    :param threads:
        ***DEPRECATED*** Sets `worker_type` to THREAD and `max_workers` to the
        number specified. If `worker_type` or `max_workers` are set explicitly,
        this parameter should be set to None. Please use `worker_type` and
        `max_workers` instead of this parameter.

    :type deadline: int
    :param deadline:
        The number of seconds to wait for all threads to resolve. If the
        deadline is reached, all threads will be terminated regardless of their
        progress and `concurrent.futures.TimeoutError` will be raised. This can
        be left as the default of `None` (no deadline) for most use cases.

    :type raise_exception: bool
    :param raise_exception:
        If True, instead of adding exceptions to the list of return values,
        instead they will be raised. Note that encountering an exception on one
        operation will not prevent other operations from starting. Exceptions
        are only processed and potentially raised after all operations are
        complete in success or failure.

    :type worker_type: str
    :param worker_type:
        The worker type to use; one of `google.cloud.storage.transfer_manager.PROCESS`
        or `google.cloud.storage.transfer_manager.THREAD`.

        Although the exact performance impact depends on the use case, in most
        situations the PROCESS worker type will use more system resources (both
        memory and CPU) and result in faster operations than THREAD workers.

        Because the subprocesses of the PROCESS worker type can't access memory
        from the main process, Client objects have to be serialized and then
        recreated in each subprocess. The serialization of the Client object
        for use in subprocesses is an approximation and may not capture every
        detail of the Client object, especially if the Client was modified after
        its initial creation or if `Client._http` was modified in any way.

        THREAD worker types are observed to be relatively efficient for
        operations with many small files, but not for operations with large
        files. PROCESS workers are recommended for large file operations.

        PROCESS workers do not support writing to file handlers. Please refer
        to files by filename only when using PROCESS workers.

    :type max_workers: int
    :param max_workers:
        The maximum number of workers to create to handle the workload.

        With PROCESS workers, a larger number of workers will consume more
        system resources (memory and CPU) at once.

        How many workers is optimal depends heavily on the specific use case,
        and the default is a conservative number that should work okay in most
        cases without consuming excessive resources.

    :type skip_if_exists: bool
    :param skip_if_exists:
        Before downloading each blob, check if the file for the filename exists;
        if it does, skip that blob. Only string filenames are checked; file
        objects are never skipped.

    :raises: :exc:`concurrent.futures.TimeoutError` if deadline is exceeded.
        :exc:`ValueError` if a non-string file object is supplied while the
        selected worker type requires pickling.

    :rtype: list
    :returns: A list of results, in input order, one per download that was
        actually submitted. If an exception was received, it will be the result
        for that operation. Otherwise, the return value from the successful
        download method is used (which will be None). Blobs skipped because of
        `skip_if_exists` are never submitted and contribute no entry, so in
        that case the list can be shorter than the input.
    """
    # Always work on a private copy: a key is injected below and the caller's
    # dict must not be modified as a side effect of this call.
    download_kwargs = dict(download_kwargs) if download_kwargs is not None else {}

    download_kwargs["command"] = "tm.download_many"

    pool_class, needs_pickling = _get_pool_class_and_requirements(worker_type)

    with pool_class(max_workers=max_workers) as executor:
        futures = []
        for blob, path_or_file in blob_file_pairs:
            # File objects are only supported by the THREAD worker because they can't
            # be pickled.
            if needs_pickling and not isinstance(path_or_file, str):
                raise ValueError(
                    "Passing in a file object is only supported by the THREAD worker type. Please either select THREAD workers, or pass in filenames only."
                )

            # An already-present destination file means this blob is not
            # submitted at all, so it gets no slot in `results`.
            if skip_if_exists and isinstance(path_or_file, str):
                if os.path.isfile(path_or_file):
                    continue

            futures.append(
                executor.submit(
                    _call_method_on_maybe_pickled_blob,
                    _pickle_client(blob) if needs_pickling else blob,
                    (
                        "_handle_filename_and_download"
                        if isinstance(path_or_file, str)
                        else "_prep_and_do_download"
                    ),
                    path_or_file,
                    **download_kwargs,
                )
            )
        # NOTE(review): `concurrent.futures.wait` returns on timeout rather
        # than raising; confirm where the TimeoutError documented for
        # `deadline` is expected to originate.
        concurrent.futures.wait(
            futures, timeout=deadline, return_when=concurrent.futures.ALL_COMPLETED
        )

    results = []
    for future in futures:
        # If raise_exception is False, don't call future.result()
        if not raise_exception:
            exp = future.exception()
            if exp:
                results.append(exp)
                continue
        # Get the real result. If there was an exception, this will raise it.
        results.append(future.result())
    return results
@_deprecate_threads_param
def upload_many_from_filenames(
    bucket,
    filenames,
    source_directory="",
    blob_name_prefix="",
    skip_if_exists=False,
    blob_constructor_kwargs=None,
    upload_kwargs=None,
    threads=None,
    deadline=None,
    raise_exception=False,
    worker_type=PROCESS,
    max_workers=DEFAULT_MAX_WORKERS,
    *,
    additional_blob_attributes=None,
):
    """Upload many files concurrently by their filenames.

    A destination blob is created for every filename; its name is
    `blob_name_prefix` followed by the filename. The local file is looked up
    at `source_directory` joined with the filename. The resulting
    (path, blob) pairs are handed to `upload_many`.

    For example, if the `filenames` include "images/icon.jpg",
    `source_directory` is "/home/myuser/", and `blob_name_prefix` is "myfiles/",
    then the file at "/home/myuser/images/icon.jpg" will be uploaded to a blob
    named "myfiles/images/icon.jpg".

    :type bucket: :class:`google.cloud.storage.bucket.Bucket`
    :param bucket:
        The bucket which will contain the uploaded blobs.

    :type filenames: list(str)
    :param filenames:
        A list of filenames to be uploaded. This may include part of the path.
        The file will be accessed at the full path of `source_directory` +
        `filename`.

    :type source_directory: str
    :param source_directory:
        A string that will be prepended (with `os.path.join()`) to each filename
        in the input list, in order to find the source file for each blob.
        Unlike the filename itself, the source_directory does not affect the
        name of the uploaded blob.

        For instance, if the source_directory is "/tmp/img/" and a filename is
        "0001.jpg", with an empty blob_name_prefix, then the file uploaded will
        be "/tmp/img/0001.jpg" and the destination blob will be "0001.jpg".

        This parameter can be an empty string.

        Note that this parameter allows directory traversal (e.g. "/", "../")
        and is not intended for unsanitized end user input.

    :type blob_name_prefix: str
    :param blob_name_prefix:
        A string that will be prepended to each filename in the input list, in
        order to determine the name of the destination blob. Unlike the filename
        itself, the prefix string does not affect the location the library will
        look for the source data on the local filesystem.

        For instance, if the source_directory is "/tmp/img/", the
        blob_name_prefix is "myuser/mystuff-" and a filename is "0001.jpg" then
        the file uploaded will be "/tmp/img/0001.jpg" and the destination blob
        will be "myuser/mystuff-0001.jpg".

        The blob_name_prefix can be blank (an empty string).

    :type skip_if_exists: bool
    :param skip_if_exists:
        If True, blobs that already have a live version will not be overwritten.
        This is accomplished by setting `if_generation_match = 0` on uploads.
        Uploads so skipped will result in a 412 Precondition Failed response
        code, which will be included in the return value, but not raised
        as an exception regardless of the value of raise_exception.

    :type blob_constructor_kwargs: dict
    :param blob_constructor_kwargs:
        A dictionary of keyword arguments to pass to the blob constructor. Refer
        to the documentation for `blob.Blob()` for more information. The dict is
        directly passed into the constructor and is not validated by this
        function. `name` and `bucket` keyword arguments are reserved by this
        function and will result in an error if passed in here.

    :type upload_kwargs: dict
    :param upload_kwargs:
        A dictionary of keyword arguments to pass to the upload method. Refer
        to the documentation for `blob.upload_from_file()` or
        `blob.upload_from_filename()` for more information. The dict is directly
        passed into the upload methods and is not validated by this function.

    :type threads: int
    :param threads:
        ***DEPRECATED*** Sets `worker_type` to THREAD and `max_workers` to the
        number specified. If `worker_type` or `max_workers` are set explicitly,
        this parameter should be set to None. Please use `worker_type` and
        `max_workers` instead of this parameter.

    :type deadline: int
    :param deadline:
        The number of seconds to wait for all threads to resolve. If the
        deadline is reached, all threads will be terminated regardless of their
        progress and `concurrent.futures.TimeoutError` will be raised. This can
        be left as the default of `None` (no deadline) for most use cases.

    :type raise_exception: bool
    :param raise_exception:
        If True, instead of adding exceptions to the list of return values,
        instead they will be raised. Note that encountering an exception on one
        operation will not prevent other operations from starting. Exceptions
        are only processed and potentially raised after all operations are
        complete in success or failure.

        If skip_if_exists is True, 412 Precondition Failed responses are
        considered part of normal operation and are not raised as an exception.

    :type worker_type: str
    :param worker_type:
        The worker type to use; one of `google.cloud.storage.transfer_manager.PROCESS`
        or `google.cloud.storage.transfer_manager.THREAD`.

        Although the exact performance impact depends on the use case, in most
        situations the PROCESS worker type will use more system resources (both
        memory and CPU) and result in faster operations than THREAD workers.

        Because the subprocesses of the PROCESS worker type can't access memory
        from the main process, Client objects have to be serialized and then
        recreated in each subprocess. The serialization of the Client object
        for use in subprocesses is an approximation and may not capture every
        detail of the Client object, especially if the Client was modified after
        its initial creation or if `Client._http` was modified in any way.

        THREAD worker types are observed to be relatively efficient for
        operations with many small files, but not for operations with large
        files. PROCESS workers are recommended for large file operations.

    :type max_workers: int
    :param max_workers:
        The maximum number of workers to create to handle the workload.

        With PROCESS workers, a larger number of workers will consume more
        system resources (memory and CPU) at once.

        How many workers is optimal depends heavily on the specific use case,
        and the default is a conservative number that should work okay in most
        cases without consuming excessive resources.

    :type additional_blob_attributes: dict
    :param additional_blob_attributes:
        A dictionary of blob attribute names and values. This allows the
        configuration of blobs beyond what is possible with
        blob_constructor_kwargs. For instance, {"cache_control": "no-cache"}
        would set the cache_control attribute of each blob to "no-cache".

        As with blob_constructor_kwargs, this affects the creation of every
        blob identically. To fine-tune each blob individually, use `upload_many`
        and create the blobs as desired before passing them in.

    :raises: :exc:`concurrent.futures.TimeoutError` if deadline is exceeded.

    :rtype: list
    :returns: A list of results corresponding to, in order, each item in the
        input list. If an exception was received, it will be the result
        for that operation. Otherwise, the return value from the successful
        upload method is used (which will be None).
    """
    constructor_kwargs = (
        {} if blob_constructor_kwargs is None else blob_constructor_kwargs
    )
    extra_attributes = (
        {} if additional_blob_attributes is None else additional_blob_attributes
    )

    def build_pair(name):
        # Local source path first, then the destination blob configured with
        # the shared constructor kwargs and extra attributes.
        local_path = os.path.join(source_directory, name)
        target = bucket.blob(blob_name_prefix + name, **constructor_kwargs)
        for attribute, setting in extra_attributes.items():
            setattr(target, attribute, setting)
        return local_path, target

    # `threads` is intentionally not forwarded: the decorator has already
    # translated it into `worker_type` / `max_workers`.
    return upload_many(
        [build_pair(name) for name in filenames],
        skip_if_exists=skip_if_exists,
        upload_kwargs=upload_kwargs,
        deadline=deadline,
        raise_exception=raise_exception,
        worker_type=worker_type,
        max_workers=max_workers,
    )
@_deprecate_threads_param
def download_many_to_path(
bucket,
blob_names,
destination_directory="",
blob_name_prefix="",
download_kwargs=None,
threads=None,
deadline=None,
create_directories=True,
raise_exception=False,
worker_type=PROCESS,
max_workers=DEFAULT_MAX_WORKERS,
*,
skip_if_exists=False,
):
"""Download many files concurrently by their blob names.
The destination files are automatically created, with paths based on the
source `blob_names` and the `destination_directory`.
The destination files are not automatically deleted if their downloads fail,
so please check the return value of this function for any exceptions, or
enable `raise_exception=True`, and process the files accordingly.
For example, if the `blob_names` include "icon.jpg", `destination_directory`
is "/home/myuser/", and `blob_name_prefix` is "images/", then the blob named
"images/icon.jpg" will be downloaded to a file named
"/home/myuser/icon.jpg".
Note1: if the path after combining `blob_name` and `destination_directory`
resolves outside `destination_directory` a warning will be issued and the
that particular blob will NOT be downloaded. This may happen in scenarios
where `blob_name` contains "../"
For example,
consider `destination_directory` is "downloads/gcs_blobs" and
`blob_name` is '../hello.blob'. This blob will not be downloaded
because the final resolved path would be "downloads/hello.blob"
To give further examples, the following blobs will not be downloaded because
it "escapes" the "destination_directory"
"../../local/target", # skips download
"../escape.txt", # skips download
"go/four/levels/deep/../../../../../somefile1", # skips download
"go/four/levels/deep/../some_dir/../../../../../invalid/path1" # skips download
however the following blobs will be downloaded because the final resolved
destination_directory is still child of given destination_directory
"data/../sibling.txt",
"dir/./file.txt",
"go/four/levels/deep/../somefile2",
"go/four/levels/deep/../some_dir/valid/path1",
"go/four/levels/deep/../some_dir/../../../../valid/path2",
It is adviced to use other APIs such as `transfer_manager.download_many` or
`Blob.download_to_filename` or `Blob.download_to_file` to download such blobs.
Note2:
The resolved download_directory will always be relative to user provided
`destination_directory`. For example,
a `blob_name` "/etc/passwd" will be downloaded into
"destination_directory/etc/passwd" instead of "/etc/passwd"
Similarly,
"/tmp/my_fav_blob" downloads to "destination_directory/tmp/my_fav_blob"
:type bucket: :class:`google.cloud.storage.bucket.Bucket`
:param bucket:
The bucket which contains the blobs to be downloaded
:type blob_names: list(str)
:param blob_names:
A list of blobs to be downloaded. The blob name in this string will be
used to determine the destination file path as well.
The full name to the blob must be blob_name_prefix + blob_name. The
blob_name is separate from the blob_name_prefix because the blob_name
will also determine the name of the destination blob. Any shared part of
the blob names that need not be part of the destination path should be
included in the blob_name_prefix.
:type destination_directory: str
:param destination_directory:
A string that will be prepended to each blob_name in the input list, in
order to determine the destination path for that blob.
For instance, if the destination_directory string is "/tmp/img" and a
blob_name is "0001.jpg", with an empty blob_name_prefix, then the source
blob "0001.jpg" will be downloaded to destination "/tmp/img/0001.jpg" .
This parameter can be an empty string.
Note directory traversal may be possible as long as the final
(e.g. "/", "../") resolved path is inside "destination_directory".
See examples above.
:type blob_name_prefix: str
:param blob_name_prefix:
A string that will be prepended to each blob_name in the input list, in
order to determine the name of the source blob. Unlike the blob_name
itself, the prefix string does not affect the destination path on the
local filesystem. For instance, if the destination_directory is
"/tmp/img/", the blob_name_prefix is "myuser/mystuff-" and a blob_name
is "0001.jpg" then the source blob "myuser/mystuff-0001.jpg" will be
downloaded to "/tmp/img/0001.jpg". The blob_name_prefix can be blank
(an empty string).
:type download_kwargs: dict
:param download_kwargs:
A dictionary of keyword arguments to pass to the download method. Refer
to the documentation for `blob.download_to_file()` or
`blob.download_to_filename()` for more information. The dict is directly
passed into the download methods and is not validated by this function.
:type threads: int
:param threads:
***DEPRECATED*** Sets `worker_type` to THREAD and `max_workers` to the
number specified. If `worker_type` or `max_workers` are set explicitly,
this parameter should be set to None. Please use `worker_type` and
`max_workers` instead of this parameter.
:type deadline: int
:param deadline:
The number of seconds to wait for all threads to resolve. If the
deadline is reached, all threads will be terminated regardless of their
progress and `concurrent.futures.TimeoutError` will be raised. This can
be left as the default of `None` (no deadline) for most use cases.
:type create_directories: bool
:param create_directories:
If True, recursively create any directories that do not exist. For
instance, if downloading object "images/img001.png", create the
directory "images" before downloading.
:type raise_exception: bool
:param raise_exception:
If True, instead of adding exceptions to the list of return values,
instead they will be raised. Note that encountering an exception on one
operation will not prevent other operations from starting. Exceptions
are only processed and potentially raised after all operations are
complete in success or failure. If skip_if_exists is True, 412
Precondition Failed responses are considered part of normal operation
and are not raised as an exception.
:type worker_type: str
:param worker_type:
The worker type to use; one of `google.cloud.storage.transfer_manager.PROCESS`
or `google.cloud.storage.transfer_manager.THREAD`.
Although the exact performance impact depends on the use case, in most
situations the PROCESS worker type will use more system resources (both
memory and CPU) and result in faster operations than THREAD workers.
Because the subprocesses of the PROCESS worker type can't access memory
from the main process, Client objects have to be serialized and then
recreated in each subprocess. The serialization of the Client object
for use in subprocesses is an approximation and may not capture every
detail of the Client object, especially if the Client was modified after
its initial creation or if `Client._http` was modified in any way.
THREAD worker types are observed to be relatively efficient for
operations with many small files, but not for operations with large
files. PROCESS workers are recommended for large file operations.
:type max_workers: int
:param max_workers:
The maximum number of workers to create to handle the workload.
With PROCESS workers, a larger number of workers will consume more
system resources (memory and CPU) at once.
How many workers is optimal depends heavily on the specific use case,
and the default is a conservative number that should work okay in most
cases without consuming excessive resources.
:type skip_if_exists: bool
:param skip_if_exists:
Before downloading each blob, check if the file for the filename exists;
if it does, skip that blob. This only works for filenames.
:raises: :exc:`concurrent.futures.TimeoutError` if deadline is exceeded.
:rtype: List[None|Exception|UserWarning]
:returns: A list of results corresponding to, in order, each item in the
input list. If an exception was received or a download was skipped
(e.g., due to existing file or path traversal), it will be the result
for that operation (as an Exception or UserWarning, respectively).
Otherwise, the result will be None for a successful download.
"""
results = [None] * len(blob_names)
blob_file_pairs = []
indices_to_process = []
for i, blob_name in enumerate(blob_names):
full_blob_name = blob_name_prefix + blob_name
try:
resolved_path = _resolve_path(destination_directory, blob_name)
except InvalidPathError as e:
msg = f"The blob {blob_name} will **NOT** be downloaded. {e}"
warnings.warn(msg)
results[i] = UserWarning(msg)
continue
if not resolved_path.parent.is_relative_to(
Path(destination_directory).resolve()
):
msg = (
f"The blob {blob_name} will **NOT** be downloaded. "
f"The resolved destination_directory - {resolved_path.parent} - is either invalid or "
f"escapes user provided {Path(destination_directory).resolve()} . Please download this file separately using `download_to_filename`"
)
warnings.warn(msg)
results[i] = UserWarning(msg)
continue
resolved_path = str(resolved_path)
if skip_if_exists and os.path.isfile(resolved_path):
msg = f"The blob {blob_name} is skipped because destination file already exists"
results[i] = UserWarning(msg)
continue
if create_directories:
directory, _ = os.path.split(resolved_path)
os.makedirs(directory, exist_ok=True)
blob_file_pairs.append((bucket.blob(full_blob_name), resolved_path))
indices_to_process.append(i)
many_results = download_many(
blob_file_pairs,
download_kwargs=download_kwargs,
deadline=deadline,
raise_exception=raise_exception,
worker_type=worker_type,
max_workers=max_workers,
skip_if_exists=False, # skip_if_exists is handled in the loop above
)
for meta_index, result in zip(indices_to_process, many_results):
results[meta_index] = result
return results
def download_chunks_concurrently(
blob,
filename,
chunk_size=TM_DEFAULT_CHUNK_SIZE,
download_kwargs=None,
deadline=None,
worker_type=PROCESS,
max_workers=DEFAULT_MAX_WORKERS,
*,
crc32c_checksum=True,
):
"""Download a single file in chunks, concurrently.
In some environments, using this feature with multiple processes will result
in faster downloads of large files.
Using this feature with multiple threads is unlikely to improve download
performance under normal circumstances due to Python interpreter threading
behavior. The default is therefore to use processes instead of threads.
:type blob: :class:`google.cloud.storage.blob.Blob`
:param blob:
The blob to be downloaded.
:type filename: str
:param filename:
The destination filename or path.
:type chunk_size: int
:param chunk_size:
The size in bytes of each chunk to download. The optimal chunk size for
maximum throughput may vary depending on the exact network environment
and size of the blob.
:type download_kwargs: dict
:param download_kwargs:
A dictionary of keyword arguments to pass to the download method. Refer
to the documentation for `blob.download_to_file()` or
`blob.download_to_filename()` for more information. The dict is directly
passed into the download methods and is not validated by this function.
Keyword arguments "start" and "end" are not supported and will
cause a ValueError if present. The key "checksum" is also not supported
in `download_kwargs`, but see the argument `crc32c_checksum` (which does
not go in `download_kwargs`) below.
:type deadline: int
:param deadline:
The number of seconds to wait for all threads to resolve. If the
deadline is reached, all threads will be terminated regardless of their
progress and `concurrent.futures.TimeoutError` will be raised. This can
be left as the default of `None` (no deadline) for most use cases.
:type worker_type: str
:param worker_type:
The worker type to use; one of `google.cloud.storage.transfer_manager.PROCESS`
or `google.cloud.storage.transfer_manager.THREAD`.
Although the exact performance impact depends on the use case, in most
situations the PROCESS worker type will use more system resources (both
memory and CPU) and result in faster operations than THREAD workers.
Because the subprocesses of the PROCESS worker type can't access memory
from the main process, Client objects have to be serialized and then
recreated in each subprocess. The serialization of the Client object
for use in subprocesses is an approximation and may not capture every
detail of the Client object, especially if the Client was modified after
its initial creation or if `Client._http` was modified in any way.
THREAD worker types are observed to be relatively efficient for
operations with many small files, but not for operations with large
files. PROCESS workers are recommended for large file operations.
:type max_workers: int
:param max_workers:
The maximum number of workers to create to handle the workload.
With PROCESS workers, a larger number of workers will consume more
system resources (memory and CPU) at once.
How many workers is optimal depends heavily on the specific use case,
and the default is a conservative number that should work okay in most
cases without consuming excessive resources.
:type crc32c_checksum: bool
:param crc32c_checksum:
Whether to compute a checksum for the resulting object, using the crc32c
algorithm. As the checksums for each chunk must be combined using a
feature of crc32c that is not available for md5, md5 is not supported.
:raises:
:exc:`concurrent.futures.TimeoutError`
if deadline is exceeded.
:exc:`google.cloud.storage._media.common.DataCorruption`
if the download's checksum doesn't agree with server-computed
checksum. The `google.cloud.storage._media` exception is used here for
consistency with other download methods despite the exception
originating elsewhere.
"""
client = blob.client
if download_kwargs is None:
download_kwargs = {}
if "start" in download_kwargs or "end" in download_kwargs:
raise ValueError(
"Download arguments 'start' and 'end' are not supported by download_chunks_concurrently."
)
if "checksum" in download_kwargs:
raise ValueError(
"'checksum' is in download_kwargs, but is not supported because sliced downloads have a different checksum mechanism from regular downloads. Use the 'crc32c_checksum' argument on download_chunks_concurrently instead."
)
download_kwargs = download_kwargs.copy()
download_kwargs["checksum"] = None
download_kwargs["command"] = "tm.download_sharded"
# We must know the size and the generation of the blob.
if not blob.size or not blob.generation:
blob.reload()
pool_class, needs_pickling = _get_pool_class_and_requirements(worker_type)
# Pickle the blob ahead of time (just once, not once per chunk) if needed.
maybe_pickled_blob = _pickle_client(blob) if needs_pickling else blob
futures = []
# Create and/or truncate the destination file to prepare for sparse writing.
with open(filename, "wb") as _:
pass
with pool_class(max_workers=max_workers) as executor: