www.kangwoo.kr/2020/03/21/kubeflow-katib-%ED%95%98%EC%9D%B4%ED%8D%BC-%ED%8C%8C%EB%9D%BC%EB%AF%B8%ED%84%B0-%ED%8A%9C%EB%8B%9D/

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Katib Experiment: random search over learning rate, dropout rate and
# optimizer choice for the training script /app/fashion_mnist-katib.py.
# Each trial runs as a batch/v1 Job built from trialTemplate below.
apiVersion: "kubeflow.org/v1beta1"
kind: Experiment
metadata:
  namespace: ohkwon
  name: fashion-mnist-experiment-1

spec:
  # Metric names must match the "<name>=<value>" lines the training
  # container prints to stdout.
  objective:
    type: maximize
    goal: 0.99
    objectiveMetricName: Validation-accuracy
    additionalMetricNames:
      - accuracy
      - loss
      - Validation-loss

  algorithm:
    algorithmName: random
  parallelTrialCount: 2
  maxTrialCount: 50
  maxFailedTrialCount: 3

  # Search space. Bounds are written as strings, as in the original manifest.
  parameters:
    - name: learning_rate
      parameterType: double
      feasibleSpace:
        min: "0.005"
        max: "0.015"
    - name: dropout_rate
      parameterType: double
      feasibleSpace:
        min: "0.1"
        max: "0.9"
    # Integer index handed to the script's --opt flag.
    - name: opt
      parameterType: int
      feasibleSpace:
        min: "0"
        max: "1"

  trialTemplate:
    primaryContainerName: training-container
    # Each entry maps a ${trialParameters.<name>} placeholder used in
    # trialSpec to one of the search-space parameters above.
    trialParameters:
      - name: learning_rate
        description: Learning rate for the training model
        reference: learning_rate
      - name: dropout_rate
        description: Dropout rate for the training model
        reference: dropout_rate
      - name: opt
        description: Training model optimizer index (0 = sgd, 1 = adam)
        reference: opt
    trialSpec:
      apiVersion: batch/v1
      kind: Job
      spec:
        template:
          metadata:
            name: training-job
            namespace: ohkwon
            annotations:
              # Keep the Istio sidecar out of the trial pod.
              sidecar.istio.io/inject: "false"
          spec:
            containers:
              - name: training-container
                image: kubeflow-registry.default.svc.cluster.local:30000/fairing-job:FE0FBC27
                command:
                  - "python3"
                  - "/app/fashion_mnist-katib.py"
                  - "--learning_rate=${trialParameters.learning_rate}"
                  - "--dropout_rate=${trialParameters.dropout_rate}"
                  - "--opt=${trialParameters.opt}"
                ports:
                  - containerPort: 443
                  - containerPort: 80
            restartPolicy: Never
 
cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from tensorflow.python.keras.callbacks import Callback
from datetime import datetime, timezone
 
import tensorflow as tf
import argparse
import socket
import os
 
class MyFashionMnist(object):
    """Trains a small dense Keras classifier using hyperparameters from the CLI."""

    def train(self):
        """Parse hyperparameters from ``sys.argv``, then build, compile and fit the model.

        Command-line options:
            --learning_rate / -lr (float, default 0.01): optimizer learning rate.
            --dropout_rate (float, default 0.2): rate of the Dropout layer.
            --opt (int, default 1): index into ``[sgd, adam]``.

        Per-epoch metrics are printed by the ``KatibMetricLog`` callback.

        Raises:
            IndexError: if ``--opt`` is outside the range of the optimizer list.
        """
        parser = argparse.ArgumentParser()
        # The two option strings must be separate arguments. Written as
        # adjacent literals they concatenate into '--learning_rate-lr', whose
        # dest is `learning_rate_lr`, so `args.learning_rate` would not exist.
        parser.add_argument('--learning_rate', '-lr', required=False, type=float, default=0.01)
        parser.add_argument('--dropout_rate', required=False, type=float, default=0.2)
        parser.add_argument('--opt', required=False, type=int, default=1)

        args = parser.parse_args()

        # NOTE(review): this loads the digits MNIST dataset although the class
        # is named after Fashion-MNIST; confirm which dataset is intended.
        mnist = tf.keras.datasets.mnist

        (x_train, y_train), (x_test, y_test) = mnist.load_data()
        # Scale pixel values by 1/255.
        x_train, x_test = x_train / 255.0, x_test / 255.0

        model = tf.keras.models.Sequential([
            tf.keras.layers.Flatten(input_shape=(28,28)),
            tf.keras.layers.Dense(128, activation='relu'),
            # Use the tuned dropout rate rather than a hard-coded constant;
            # the default of --dropout_rate keeps the previous value of 0.2.
            tf.keras.layers.Dropout(args.dropout_rate),
            tf.keras.layers.Dense(10, activation='softmax')
        ])

        model.summary()

        # `learning_rate` replaces the deprecated `lr` keyword.
        sgd  = tf.keras.optimizers.SGD(learning_rate=args.learning_rate)
        adam = tf.keras.optimizers.Adam(learning_rate=args.learning_rate)

        # --opt selects by position: 0 -> SGD, 1 -> Adam.
        optimizers= [sgd, adam]

        # The metric is named 'acc' so the training logs carry the keys
        # 'acc' / 'val_acc' that KatibMetricLog reads.
        model.compile(optimizer = optimizers[args.opt],
                     loss = 'sparse_categorical_crossentropy',
                     metrics = ['acc'])

        # verbose=0 silences Keras' own progress output; the callback prints
        # the metric lines instead.
        model.fit(x_train, y_train,
                 verbose=0,
                 validation_data=(x_test, y_test),
                 epochs=5,
                 callbacks=[KatibMetricLog()])
    
class KatibMetricLog(Callback):
    """Keras callback that prints timestamped ``name=value`` metric lines per epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Print the 1-based epoch number followed by one line per tracked metric.

        Args:
            epoch: zero-based epoch index; printed as ``epoch + 1``.
            logs: mapping that must contain 'acc', 'loss', 'val_acc' and
                'val_loss'.
        """
        # Local time with UTC offset in ISO format (RFC 3339).
        stamp = datetime.now(timezone.utc).astimezone().isoformat()
        print("\nEpoch {}".format(epoch+1))

        # (output template, key in `logs`) pairs, emitted in this order.
        report = (
            ("{} accuracy={:.4f}", 'acc'),
            ("{} loss={:.4f}", 'loss'),
            ("{} Validation-accuracy={:.4f}", 'val_acc'),
            ("{} Validation-loss={:.4f}", 'val_loss'),
        )
        for template, key in report:
            print(template.format(stamp, logs[key]))
 
    
#     def on_batch_end(self, batch, logs={}):
#         local_time = datetime.now(timezone.utc).astimezone().isoformat()
 
#         print("{} batch={}".format(local_time, str(batch)))        
#         print("{} accuracy={:.4f}".format(local_time, logs['acc']))
#         print("{} loss={:.4f}".format(local_time, logs['loss']))
 
        
        
#     def on_epoch_begin(self, epoch, logs={}):
#         local_time = datetime.now(timezone.utc).astimezone().isoformat()
#         print("{} epoch={}".format(local_time, str(epoch)))
        
    
#     def on_epoch_end(self, epoch, logs={}):
#         local_time = datetime.now(timezone.utc).astimezone().isoformat()
#         print("{} Validation-accuracy={:.4f}".format(local_time, logs['val_acc']))
#         print("{} Validation-loss={:.4f}".format(local_time, logs['val_loss']))
        
#         return
    
if __name__=='__main__':
    # Two modes, selected by the FAIRING_RUNTIME environment variable:
    #   * unset -> configure Kubeflow Fairing to build an image and submit a job;
    #   * set   -> run the training directly.
    # NOTE(review): presumably Fairing sets FAIRING_RUNTIME inside the job
    # container - confirm against the Fairing documentation.
    if os.getenv('FAIRING_RUNTIME', None) is None:
        from kubeflow import fairing
        from kubeflow.fairing.kubernetes import utils as k8s_utils

        DOCKER_REGISTRY = 'kubeflow-registry.default.svc.cluster.local:30000'

        # Use the 'append' builder on top of the base image and push the
        # result to the registry above.
        fairing.config.set_builder(
            'append',
            image_name='fairing-job',
            base_image='brightfly/kubeflow-jupyter-lab:tf2.0-cpu',
            registry=DOCKER_REGISTRY,
            push=True)

        # Deploy as a job in the 'ohkwon' namespace with a resource mutator
        # applied to the pod spec.
        fairing.config.set_deployer('job',
                                   namespace='ohkwon',
                                   pod_spec_mutators=[
                                       k8s_utils.get_resource_mutator(cpu = 2, memory = 4)
                                   ])
        fairing.config.run()

    else:
        remote_train = MyFashionMnist()
        remote_train.train()
        
        
cs

'Devops' 카테고리의 다른 글

kubernetes config update 방법  (0) 2021.05.06
kfserving tutorial  (0) 2021.05.05
kubeflow's distributed learning  (0) 2021.05.04
aws deep learning containers  (0) 2021.05.04
aws readwrite many volume 생성법  (0) 2021.05.04

+ Recent posts