1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133 | import msgpack
from tlz import valmap
from dask.core import keys_in_tasks
from dask.highlevelgraph import HighLevelGraph, Layer
from ..utils_comm import unpack_remotedata, subs_multiple
from ..worker import dumps_task
from ..utils import (
str_graph,
tokey,
CancelledError,
)
from .utils import (
msgpack_opts,
)
from .serialize import (
import_allowed_module,
msgpack_encode_default,
msgpack_decode_default,
)
def _materialized_layer_pack(
layer: Layer,
all_keys,
known_key_dependencies,
client,
client_keys,
):
from ..client import Future
dsk = dict(layer)
# Find aliases not in `client_keys` and substitute all matching keys
# with its Future
values = {
k: v for k, v in dsk.items() if isinstance(v, Future) and k not in client_keys
}
if values:
dsk = subs_multiple(dsk, values)
# Unpack remote data re record its dependencies
dsk = {k: unpack_remotedata(v, byte_keys=True) for k, v in layer.items()}
unpacked_futures = set.union(*[v[1] for v in dsk.values()]) if dsk else set()
for future in unpacked_futures:
if future.client is not client:
raise ValueError(
"Inputs contain futures that were created by another client."
)
if tokey(future.key) not in client.futures:
raise CancelledError(tokey(future.key))
unpacked_futures_deps = {}
for k, v in dsk.items():
if len(v[1]):
unpacked_futures_deps[k] = {f.key for f in v[1]}
dsk = {k: v[0] for k, v in dsk.items()}
# Calculate dependencies without re-calculating already known dependencies
missing_keys = set(dsk.keys()).difference(known_key_dependencies.keys())
dependencies = {
k: keys_in_tasks(all_keys, [dsk[k]], as_list=False) for k in missing_keys
}
for k, v in unpacked_futures_deps.items():
dependencies[k] = set(dependencies.get(k, ())) | v
# The scheduler expect all keys to be strings
dependencies = {
tokey(k): [tokey(dep) for dep in deps] for k, deps in dependencies.items()
}
dsk = str_graph(dsk, extra_values=all_keys)
dsk = valmap(dumps_task, dsk)
return {"dsk": dsk, "dependencies": dependencies}
def highlevelgraph_pack(hlg: HighLevelGraph, client, client_keys):
layers = []
# Dump each layer (in topological order)
for layer in (hlg.layers[name] for name in hlg._toposort_layers()):
if not layer.is_materialized():
state = layer.__dask_distributed_pack__(client)
if state is not None:
layers.append(
{
"__module__": layer.__module__,
"__name__": type(layer).__name__,
"state": state,
}
)
continue
# Falling back to the default serialization, which will materialize the layer
layers.append(
{
"__module__": None,
"__name__": None,
"state": _materialized_layer_pack(
layer,
hlg.get_all_external_keys(),
hlg.key_dependencies,
client,
client_keys,
),
}
)
return msgpack.dumps({"layers": layers}, default=msgpack_encode_default)
def _materialized_layer_unpack(state, dsk, dependencies):
dsk.update(state["dsk"])
for k, v in state["dependencies"].items():
dependencies[k] = list(set(dependencies.get(k, ())) | set(v))
def highlevelgraph_unpack(dumped_hlg):
# Notice, we set `use_list=False`, which makes msgpack convert lists to tuples
hlg = msgpack.loads(
dumped_hlg, object_hook=msgpack_decode_default, use_list=False, **msgpack_opts
)
dsk = {}
deps = {}
for layer in hlg["layers"]:
if layer["__module__"] is None: # Default implementation
unpack_func = _materialized_layer_unpack
else:
mod = import_allowed_module(layer["__module__"])
unpack_func = getattr(mod, layer["__name__"]).__dask_distributed_unpack__
unpack_func(layer["state"], dsk, deps)
return dsk, deps
|