deepreg.train - Code Metrics - Inspection of "Issue #720: enable to define num_parallel_calls in..." - DeepRegNet/DeepReg - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — main (#724)

by Yunguan

created 2021-03-27 15:30 UTC

deepreg.train A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	275
Duplicated Lines	0 %

Importance

Changes

Metric	Value
wmc	9
eloc	158
dl	0
loc	275
rs	10
c	0
b	0
f	0

3 Functions

Rating	Name	Size	Complexity
A	build_config()	43	2
C	train()	119	6
B	main()	84	1

# coding=utf-8

"""
Module to train a network using init files and a CLI.
"""

import argparse
import os
from typing import Dict, List, Tuple, Union

import tensorflow as tf

import deepreg.config.parser as config_parser
import deepreg.model.optimizer as opt
from deepreg.callback import build_checkpoint_callback
from deepreg.registry import REGISTRY
from deepreg.util import build_dataset, build_log_dir


def build_config(
    config_path: Union[str, List[str]],
    log_dir: str,
    exp_name: str,
    ckpt_path: str,
    max_epochs: int = -1,
) -> Tuple[Dict, str, str]:
    """
    Prepare everything train() needs before data and model are built.

    The function creates the log directory, loads the configuration,
    optionally overrides the number of epochs, saves a backup of the
    configuration into the log directory and finally scales the batch size
    by the number of visible GPUs.

    :param config_path: path(s) of the config file(s) passed to
        config_parser.load_configs
    :param log_dir: path of the log directory
    :param exp_name: name of the experiment
    :param ckpt_path: checkpoint path, a leading ~ is expanded
    :param max_epochs: if max_epochs > 0, use it to overwrite
        config["train"]["epochs"] and to cap config["train"]["save_period"]
    :return: - config: a dictionary saving configuration
             - log_dir: the value returned by build_log_dir
             - ckpt_path: checkpoint path with ~ expanded
    """
    # the log directory is set up before anything else is read
    experiment_dir = build_log_dir(log_dir=log_dir, exp_name=exp_name)

    config = config_parser.load_configs(config_path)

    # "~" in the checkpoint path refers to the user home directory
    expanded_ckpt_path = os.path.expanduser(ckpt_path)

    # a positive max_epochs takes precedence over the configured values,
    # the save period can never be longer than the training itself
    if max_epochs > 0:
        train_config = config["train"]
        train_config["epochs"] = max_epochs
        train_config["save_period"] = min(max_epochs, train_config["save_period"])

    # the backup is written before the batch size is scaled,
    # so the saved file keeps the per-GPU batch size
    config_parser.save(config=config, out_dir=experiment_dir)

    # the configured batch size is per GPU, training uses the global one
    num_gpus = len(tf.config.experimental.list_physical_devices("GPU"))
    preprocess_config = config["train"]["preprocess"]
    preprocess_config["batch_size"] = preprocess_config["batch_size"] * max(
        num_gpus, 1
    )

    return config, experiment_dir, expanded_ckpt_path


def train(
    gpu: str,
    config_path: Union[str, List[str]],
    ckpt_path: str,
    num_cpus: int = -1,
    gpu_allow_growth: bool = True,
    exp_name: str = "",
    log_dir: str = "logs",
    max_epochs: int = -1,
):
    """
    Function to train a model.

    The data loaders are closed before the function exits,
    no matter whether training finished or an exception was raised.

    :param gpu: value assigned to CUDA_VISIBLE_DEVICES,
        i.e. which local gpu(s) to use to train.
    :param config_path: path to configuration set up.
    :param ckpt_path: checkpoint path, forwarded to build_checkpoint_callback
        after expanding ~.
    :param num_cpus: number of cpus to be used, -1 means not limited.
    :param gpu_allow_growth: value for TF_FORCE_GPU_ALLOW_GROWTH, if True
        TensorFlow is asked not to reserve the whole GPU memory up front.
    :param log_dir: path of the log directory.
    :param exp_name: experiment name.
    :param max_epochs: if max_epochs > 0, will use it to overwrite the configuration.
    """
    # set env variables
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true" if gpu_allow_growth else "false"
    if num_cpus > 0:
        # Maximum number of threads to use for OpenMP parallel regions.
        os.environ["OMP_NUM_THREADS"] = str(num_cpus)
        # Without setting below 2 environment variables, it didn't work for me. Thanks to @cjw85
        os.environ["TF_NUM_INTRAOP_THREADS"] = str(num_cpus)
        os.environ["TF_NUM_INTEROP_THREADS"] = str(num_cpus)

    # load config
    config, log_dir, ckpt_path = build_config(
        config_path=config_path,
        log_dir=log_dir,
        exp_name=exp_name,
        ckpt_path=ckpt_path,
        max_epochs=max_epochs,
    )

    # build dataset
    data_loader_train, dataset_train, steps_per_epoch_train = build_dataset(
        dataset_config=config["dataset"],
        preprocess_config=config["train"]["preprocess"],
        mode="train",
        training=True,
        repeat=True,
    )
    assert data_loader_train is not None  # train data should not be None

    # from here on the train loader is open, so every following step is guarded
    # to make sure the loaders are closed even if building or fitting fails
    data_loader_val = None
    try:
        data_loader_val, dataset_val, steps_per_epoch_val = build_dataset(
            dataset_config=config["dataset"],
            preprocess_config=config["train"]["preprocess"],
            mode="valid",
            training=False,
            repeat=True,
        )

        # use strategy to support multiple GPUs
        # the network is mirrored in each GPU so that we can use larger batch size
        # https://www.tensorflow.org/guide/distributed_training
        # only model, optimizer and metrics need to be defined inside the strategy
        num_devices = max(len(tf.config.list_physical_devices("GPU")), 1)
        if num_devices > 1:
            strategy = tf.distribute.MirroredStrategy()  # pragma: no cover
        else:
            strategy = tf.distribute.get_strategy()
        with strategy.scope():
            model: tf.keras.Model = REGISTRY.build_model(
                config=dict(
                    name=config["train"]["method"],
                    moving_image_size=data_loader_train.moving_image_shape,
                    fixed_image_size=data_loader_train.fixed_image_shape,
                    index_size=data_loader_train.num_indices,
                    labeled=config["dataset"]["labeled"],
                    batch_size=config["train"]["preprocess"]["batch_size"],
                    config=config["train"],
                    num_devices=num_devices,
                )
            )
            optimizer = opt.build_optimizer(
                optimizer_config=config["train"]["optimizer"]
            )

        # compile
        model.compile(optimizer=optimizer)
        model.plot_model(output_dir=log_dir)

        # build callbacks
        tensorboard_callback = tf.keras.callbacks.TensorBoard(
            log_dir=log_dir,
            histogram_freq=config["train"]["save_period"],
            update_freq=config["train"].get("update_freq", "epoch"),
        )
        ckpt_callback, initial_epoch = build_checkpoint_callback(
            model=model,
            dataset=dataset_train,
            log_dir=log_dir,
            save_period=config["train"]["save_period"],
            ckpt_path=ckpt_path,
        )
        callbacks = [tensorboard_callback, ckpt_callback]

        # train
        # it's necessary to define the steps_per_epoch
        # and validation_steps to prevent errors like
        # BaseCollectiveExecutor::StartAbort Out of range: End of sequence
        model.fit(
            x=dataset_train,
            steps_per_epoch=steps_per_epoch_train,
            initial_epoch=initial_epoch,
            epochs=config["train"]["epochs"],
            validation_data=dataset_val,
            validation_steps=steps_per_epoch_val,
            callbacks=callbacks,
        )
    finally:
        # close file loaders in data loaders, on success and on failure
        data_loader_train.close()
        if data_loader_val is not None:
            data_loader_val.close()


def _str_to_bool(value: str) -> bool:
    """
    Convert a command line string into a boolean.

    Without this conversion argparse keeps the raw string,
    and any non-empty string, including "False", is truthy.

    :param value: text given on the command line, case-insensitive.
    :return: True for true/t/yes/y/on/1,
        False for false/f/no/n/off/0 or an empty string.
    :raises argparse.ArgumentTypeError: if the text is not recognised.
    """
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "on", "1"):
        return True
    if normalized in ("false", "f", "no", "n", "off", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def main(args=None):
    """
    Entry point for train script.

    Parse the command line options and forward them to train().

    :param args: list of command line arguments,
        None means argparse reads them from sys.argv.
    """

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--gpu",
        "-g",
        help="GPU index for training. "
        '-g "" for using CPU, '
        '-g "0" for using GPU 0, '
        '-g "0,1" for using GPU 0 and 1.',
        type=str,
        required=True,
    )

    # the flag can be given alone (-gr) or with an explicit value (-gr false),
    # the value is parsed so that "false" really disables the option
    parser.add_argument(
        "--gpu_allow_growth",
        "-gr",
        help="Prevent TensorFlow from reserving all available GPU memory",
        type=_str_to_bool,
        nargs="?",
        const=True,
        default=False,
    )

    parser.add_argument(
        "--num_cpus",
        help="Number of CPUs to be used, -1 means unlimited.",
        type=int,
        default=-1,
    )

    parser.add_argument(
        "--ckpt_path",
        "-k",
        help="Path of the saved model checkpoint to load. "
        "No need to provide if start training from scratch.",
        default="",
        type=str,
        required=False,
    )

    parser.add_argument(
        "--log_dir", help="Path of log directory.", default="logs", type=str
    )

    parser.add_argument(
        "--exp_name",
        "-l",
        help="Name of log directory. "
        "The directory is under log root, e.g. logs/ by default. "
        "If not provided, a timestamp based folder will be created.",
        default="",
        type=str,
    )

    parser.add_argument(
        "--config_path",
        "-c",
        help="Path of config, must end with .yaml. Can pass multiple paths.",
        type=str,
        nargs="+",
        required=True,
    )

    parser.add_argument(
        "--max_epochs",
        help="The maximum number of epochs, -1 means following configuration.",
        type=int,
        default=-1,
    )

    args = parser.parse_args(args)
    train(
        gpu=args.gpu,
        config_path=args.config_path,
        num_cpus=args.num_cpus,
        gpu_allow_growth=args.gpu_allow_growth,
        ckpt_path=args.ckpt_path,
        log_dir=args.log_dir,
        exp_name=args.exp_name,
        max_epochs=args.max_epochs,
    )


# Run the command line entry point only when this file is executed
# as a script, importing the module does not start a training.
if __name__ == "__main__":
    main()  # pragma: no cover


1			# coding=utf-8
2
3			"""
4			Module to train a network using init files and a CLI.
5			"""
6
7			import argparse
8			import os
9			from typing import Dict, List, Tuple, Union
10
11			import tensorflow as tf
12
13			import deepreg.config.parser as config_parser
14			import deepreg.model.optimizer as opt
15			from deepreg.callback import build_checkpoint_callback
16			from deepreg.registry import REGISTRY
17			from deepreg.util import build_dataset, build_log_dir
18
19
20			def build_config(
21			config_path: Union[str, List[str]],
22			log_dir: str,
23			exp_name: str,
24			ckpt_path: str,
25			max_epochs: int = -1,
26			) -> Tuple[Dict, str, str]:
27			"""
28			Function to initialise log directories,
29			assert that checkpointed model is the right
30			type and to parse the configuration for training.
31
32			:param config_path: list of str, path to config file
33			:param log_dir: path of the log directory
34			:param exp_name: name of the experiment
35			:param ckpt_path: path where model is stored.
36			:param max_epochs: if max_epochs > 0, use it to overwrite the configuration
37			:return: - config: a dictionary saving configuration
38			- exp_name: the path of directory to save logs
39			"""
40
41			# init log directory
42			log_dir = build_log_dir(log_dir=log_dir, exp_name=exp_name)
43
44			# load config
45			config = config_parser.load_configs(config_path)
46
47			# replace the ~ with user home path
48			ckpt_path = os.path.expanduser(ckpt_path)
49
50			# overwrite epochs and save_period if necessary
51			if max_epochs > 0:
52			config["train"]["epochs"] = max_epochs
53			config["train"]["save_period"] = min(max_epochs, config["train"]["save_period"])
54
55			# backup config
56			config_parser.save(config=config, out_dir=log_dir)
57
58			# batch_size in original config corresponds to batch_size per GPU
59			gpus = tf.config.experimental.list_physical_devices("GPU")
60			config["train"]["preprocess"]["batch_size"] *= max(len(gpus), 1)
61
62			return config, log_dir, ckpt_path
63
64
65			def train(
66			gpu: str,
67			config_path: Union[str, List[str]],
68			ckpt_path: str,
69			num_cpus: int = -1,
70			gpu_allow_growth: bool = True,
71			exp_name: str = "",
72			log_dir: str = "logs",
73			max_epochs: int = -1,
74			):
75			"""
76			Function to train a model.
77
78			:param gpu: which local gpu to use to train.
79			:param config_path: path to configuration set up.
80			:param ckpt_path: where to store training checkpoints.
81			:param num_cpus: number of cpus to be used, -1 means not limited.
82			:param gpu_allow_growth: whether to allocate whole GPU memory for training.
83			:param log_dir: path of the log directory.
84			:param exp_name: experiment name.
85			:param max_epochs: if max_epochs > 0, will use it to overwrite the configuration.
86			"""
87			# set env variables
88			os.environ["CUDA_VISIBLE_DEVICES"] = gpu
89			os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true" if gpu_allow_growth else "false"
90			if num_cpus > 0:
91			# Maximum number of threads to use for OpenMP parallel regions.
92			os.environ["OMP_NUM_THREADS"] = str(num_cpus)
93			# Without setting below 2 environment variables, it didn't work for me. Thanks to @cjw85
94			os.environ["TF_NUM_INTRAOP_THREADS"] = str(num_cpus)
95			os.environ["TF_NUM_INTEROP_THREADS"] = str(num_cpus)
96
97			# load config
98			config, log_dir, ckpt_path = build_config(
99			config_path=config_path,
100			log_dir=log_dir,
101			exp_name=exp_name,
102			ckpt_path=ckpt_path,
103			max_epochs=max_epochs,
104			)
105
106			# build dataset
107			data_loader_train, dataset_train, steps_per_epoch_train = build_dataset(
108			dataset_config=config["dataset"],
109			preprocess_config=config["train"]["preprocess"],
110			mode="train",
111			training=True,
112			repeat=True,
113			)
114			assert data_loader_train is not None # train data should not be None
115			data_loader_val, dataset_val, steps_per_epoch_val = build_dataset(
116			dataset_config=config["dataset"],
117			preprocess_config=config["train"]["preprocess"],
118			mode="valid",
119			training=False,
120			repeat=True,
121			)
122
123			# use strategy to support multiple GPUs
124			# the network is mirrored in each GPU so that we can use larger batch size
125			# https://www.tensorflow.org/guide/distributed_training
126			# only model, optimizer and metrics need to be defined inside the strategy
127			num_devices = max(len(tf.config.list_physical_devices("GPU")), 1)
128			if num_devices > 1:
129			strategy = tf.distribute.MirroredStrategy() # pragma: no cover
130			else:
131			strategy = tf.distribute.get_strategy()
132			with strategy.scope():
133			model: tf.keras.Model = REGISTRY.build_model(
134			config=dict(
135			name=config["train"]["method"],
136			moving_image_size=data_loader_train.moving_image_shape,
137			fixed_image_size=data_loader_train.fixed_image_shape,
138			index_size=data_loader_train.num_indices,
139			labeled=config["dataset"]["labeled"],
140			batch_size=config["train"]["preprocess"]["batch_size"],
141			config=config["train"],
142			num_devices=num_devices,
143			)
144			)
145			optimizer = opt.build_optimizer(optimizer_config=config["train"]["optimizer"])
146
147			# compile
148			model.compile(optimizer=optimizer)
149			model.plot_model(output_dir=log_dir)
150
151			# build callbacks
152			tensorboard_callback = tf.keras.callbacks.TensorBoard(
153			log_dir=log_dir,
154			histogram_freq=config["train"]["save_period"],
155			update_freq=config["train"].get("update_freq", "epoch"),
156			)
157			ckpt_callback, initial_epoch = build_checkpoint_callback(
158			model=model,
159			dataset=dataset_train,
160			log_dir=log_dir,
161			save_period=config["train"]["save_period"],
162			ckpt_path=ckpt_path,
163			)
164			callbacks = [tensorboard_callback, ckpt_callback]
165
166			# train
167			# it's necessary to define the steps_per_epoch
168			# and validation_steps to prevent errors like
169			# BaseCollectiveExecutor::StartAbort Out of range: End of sequence
170			model.fit(
171			x=dataset_train,
172			steps_per_epoch=steps_per_epoch_train,
173			initial_epoch=initial_epoch,
174			epochs=config["train"]["epochs"],
175			validation_data=dataset_val,
176			validation_steps=steps_per_epoch_val,
177			callbacks=callbacks,
178			)
179
180			# close file loaders in data loaders after training
181			data_loader_train.close()
182			if data_loader_val is not None:
183			data_loader_val.close()
184
185
186			def main(args=None):
187			"""
188			Entry point for train script.
189
190			:param args: arguments
191			"""
192
193			parser = argparse.ArgumentParser()
194
195			parser.add_argument(
196			"--gpu",
197			"-g",
198			help="GPU index for training."
199			'-g "" for using CPU'
200			'-g "0" for using GPU 0'
201			'-g "0,1" for using GPU 0 and 1.',
202			type=str,
203			required=True,
204			)
205
206			parser.add_argument(
207			"--gpu_allow_growth",
208			"-gr",
209			help="Prevent TensorFlow from reserving all available GPU memory",
210			default=False,
211			)
212
213			parser.add_argument(
214			"--num_cpus",
215			help="Number of CPUs to be used, -1 means unlimited.",
216			type=int,
217			default=-1,
218			)
219
220			parser.add_argument(
221			"--ckpt_path",
222			"-k",
223			help="Path of the saved model checkpoint to load."
224			"No need to provide if start training from scratch.",
225			default="",
226			type=str,
227			required=False,
228			)
229
230			parser.add_argument(
231			"--log_dir", help="Path of log directory.", default="logs", type=str
232			)
233
234			parser.add_argument(
235			"--exp_name",
236			"-l",
237			help="Name of log directory."
238			"The directory is under log root, e.g. logs/ by default."
239			"If not provided, a timestamp based folder will be created.",
240			default="",
241			type=str,
242			)
243
244			parser.add_argument(
245			"--config_path",
246			"-c",
247			help="Path of config, must end with .yaml. Can pass multiple paths.",
248			type=str,
249			nargs="+",
250			required=True,
251			)
252
253			parser.add_argument(
254			"--max_epochs",
255			help="The maximum number of epochs, -1 means following configuration.",
256			type=int,
257			default=-1,
258			)
259
260			args = parser.parse_args(args)
261			train(
262			gpu=args.gpu,
263			config_path=args.config_path,
264			num_cpus=args.num_cpus,
265			gpu_allow_growth=args.gpu_allow_growth,
266			ckpt_path=args.ckpt_path,
267			log_dir=args.log_dir,
268			exp_name=args.exp_name,
269			max_epochs=args.max_epochs,
270			)
271
272
273			if __name__ == "__main__":
274			main() # pragma: no cover
275

DeepRegNet / DeepReg

Pull Request — main (#724)

deepreg.train A

Complexity

Size/Duplication

Importance

3 Functions

Duplication Side-by-Side

Filter issues like