27 commits

- `6558293` Attempt at continuous eval (elibixby, Mar 1, 2017)
- `a443083` Fix continuous evaluation (elibixby, Mar 1, 2017)
- `a1863fa` Fix ExportRepeatedlyHook to write out correct step (elibixby, Mar 2, 2017)
- `a15d207` added hptuning (puneithk, Mar 2, 2017)
- `236f346` Merge branch 'continuous_eval' of github.com:puneith/cloudml-samples … (puneithk, Mar 2, 2017)
- `d5d8c36` Initial commit of export code (#3) (elibixby, Mar 2, 2017)
- `61783b7` Use PREDICT_METHOD_NAME (elibixby, Mar 2, 2017)
- `bbe12ae` merged with vanilla_tf (puneithk, Mar 2, 2017)
- `1d8d67a` added args docstring (puneithk, Mar 2, 2017)
- `5b7d48a` fixed underscores and job-dir (puneithk, Mar 2, 2017)
- `03de5d5` fixed underscores and job-dir (puneithk, Mar 2, 2017)
- `9d512b3` fixed underscores and job-dir (puneithk, Mar 2, 2017)
- `9fc4dfe` Merge branch 'continuous_eval' of github.com:puneith/cloudml-samples … (puneithk, Mar 2, 2017)
- `5aa9ff1` Updates to export (elibixby, Mar 2, 2017)
- `04b1cb6` Merge branch 'continuous_eval' of github.com:puneith/cloudml-samples … (elibixby, Mar 2, 2017)
- `b27a9b3` Fix merge errors and partial update README (elibixby, Mar 2, 2017)
- `49ddd49` Separate eval run so TensorBoard works (elibixby, Mar 2, 2017)
- `0cb55bd` changed param names in README (puneithk, Mar 2, 2017)
- `0f7d524` merge conflict fix (puneithk, Mar 2, 2017)
- `6a233f4` added docstring (puneithk, Mar 2, 2017)
- `0653ef2` added comments in the code (puneithk, Mar 2, 2017)
- `1638f63` added more comments (puneithk, Mar 2, 2017)
- `741c699` fixed hptuning command (puneithk, Mar 2, 2017)
- `0cc09b1` added parse_csv comment (puneithk, Mar 2, 2017)
- `a0e3c5e` added comments (puneithk, Mar 2, 2017)
- `6b2b786` added comments (puneithk, Mar 2, 2017)
- `c959e02` Remove key columns as table_initializers cannot be run by prediction … (elibixby, Mar 2, 2017)
67 changes: 48 additions & 19 deletions census/lowlevel-tf/README.md
@@ -15,6 +15,10 @@ on Google Cloud Storage:
* Training file is `adult.data.csv`
* Evaluation file is `adult.test.csv`

### Disclaimer
This dataset is provided by a third party. Google provides no representation,
warranty, or other guarantees about the validity or any other aspects of this dataset.

```
export CENSUS_DATA=census_data
export TRAIN_FILE=adult.data.csv
@@ -62,10 +66,10 @@ export OUTPUT_DIR=census_output
```

```
python trainer/task.py --train_data_path $CENSUS_DATA/$TRAIN_FILE \
--eval_data_path $CENSUS_DATA/$EVAL_FILE \
--output_dir $OUTPUT_DIR
[--max_steps $MAX_STEPS]
python trainer/task.py --train-data-paths $CENSUS_DATA/$TRAIN_FILE \
--eval-data-paths $CENSUS_DATA/$EVAL_FILE \
--job-dir $OUTPUT_DIR
[--max-steps $MAX_STEPS]
```

### Using gcloud local
@@ -81,17 +85,16 @@ export OUTPUT_DIR=census_output
gcloud beta ml local train --package-path trainer \
--module-name trainer.task \
-- \
--train_data_path $CENSUS_DATA/$TRAIN_FILE \
--eval_data_path $CENSUS_DATA/$EVAL_FILE \
--output_dir $OUTPUT_DIR
--train-data-paths $CENSUS_DATA/$TRAIN_FILE \
--eval-data-paths $CENSUS_DATA/$EVAL_FILE \
--job-dir $OUTPUT_DIR
```

### Using Cloud ML Engine
Run the code on Cloud ML Engine using `gcloud`:

```
export GCS_JOB_DIR=gs://<my-bucket>/path/to/my/jobs/job3
export GCS_OUTPUT_DIR=gs://<my-bucket>/path/to/my/models/run3
```

```
@@ -102,9 +105,8 @@ gcloud beta ml jobs submit training $JOB_NAME \
--package-path trainer/ \
--region us-central1 \
-- \
--train_data_path $TRAIN_GCS_FILE \
--eval_data_path $EVAL_GCS_FILE \
--output_dir $GCS_OUTPUT_DIR
--train-data-paths $TRAIN_GCS_FILE \
--eval-data-paths $EVAL_GCS_FILE
```
## Accuracy and Output
With the default number of training steps, you should see an accuracy of approximately `80.25%` in the output.
@@ -157,10 +159,10 @@ gcloud beta ml local train --package-path trainer \
--worker-count $WORKER_COUNT \
--distributed \
-- \
--train_data_path $CENSUS_DATA/$TRAIN_FILE \
--eval_data_path $CENSUS_DATA/$EVAL_FILE \
--max_steps $MAX_STEPS \
--output_dir $OUTPUT_DIR
--train-data-paths $CENSUS_DATA/$TRAIN_FILE \
--eval-data-paths $CENSUS_DATA/$EVAL_FILE \
--max-steps $MAX_STEPS \
--job-dir $OUTPUT_DIR
```

### Using Cloud ML Engine
@@ -169,7 +171,6 @@ Run the distributed training code on cloud using `gcloud`.
```
export SCALE_TIER=STANDARD_1
export GCS_JOB_DIR=gs://<my-bucket>/path/to/my/models/run3
export GCS_OUTPUT_DIR=gs://<my-bucket>/path/to/my/models/run3
```

```
@@ -181,7 +182,35 @@ gcloud beta ml jobs submit training $JOB_NAME \
--package-path trainer/ \
--region us-central1 \
-- \
--train_data_path $TRAIN_GCS_FILE \
--eval_data_path $EVAL_GCS_FILE \
--output_dir $GCS_OUTPUT_DIR
--train-data-paths $TRAIN_GCS_FILE \
--eval-data-paths $EVAL_GCS_FILE
```

# Hyperparameter Tuning
Cloud ML Engine allows you to perform hyperparameter tuning to find the
optimal hyperparameters for your model. See [Overview of Hyperparameter Tuning](https://cloud.google.com/ml/docs/concepts/hyperparameter-tuning-overview)
for more details.

## Running a Hyperparameter Tuning Job

Running a hyperparameter tuning job is almost exactly the same as running a training
job, except that you need to add the `--config` argument.

```
export HPTUNING_CONFIG=hptuning_config.yaml
```

```
gcloud beta ml jobs submit training $JOB_NAME \
--scale-tier $SCALE_TIER \
--runtime-version 1.0 \
--config $HPTUNING_CONFIG \
--job-dir $GCS_JOB_DIR \
--module-name trainer.task \
--package-path trainer/ \
--region us-central1 \
-- \
--train-data-paths $TRAIN_GCS_FILE \
--eval-data-paths $EVAL_GCS_FILE \
--max-steps $MAX_STEPS
```
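
The tuning service scores each trial by reading TensorFlow summaries written under the trial's output directory, looking for the tag named by `hyperparameterMetricTag` (here `accuracy`, defined in `hptuning_config.yaml` below). The following is a minimal sketch of how an evaluation loop could write that summary; `report_metric` and its arguments are illustrative names, not functions from this sample.

```
# Sketch only: write a scalar summary whose tag matches
# hyperparameterMetricTag so the tuning service can read it.
import tensorflow as tf

def report_metric(job_dir, accuracy_value, global_step):
    summary = tf.Summary(value=[
        tf.Summary.Value(tag='accuracy', simple_value=accuracy_value)])
    writer = tf.summary.FileWriter(job_dir)
    writer.add_summary(summary, global_step=global_step)
    writer.flush()
```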
22 changes: 22 additions & 0 deletions census/lowlevel-tf/hptuning_config.yaml
@@ -0,0 +1,22 @@
trainingInput:
hyperparameters:
goal: MAXIMIZE
hyperparameterMetricTag: accuracy
maxTrials: 4
maxParallelTrials: 2
params:
- parameterName: first_layer_size
type: INTEGER
minValue: 50
maxValue: 500
scaleType: UNIT_LINEAR_SCALE
- parameterName: num_layers
type: INTEGER
minValue: 1
maxValue: 15
scaleType: UNIT_LINEAR_SCALE
- parameterName: scale_factor
type: DOUBLE
minValue: 0.1
maxValue: 1.0
scaleType: UNIT_REVERSE_LOG_SCALE
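
For each trial, Cloud ML Engine passes the sampled values to the trainer as command-line flags named after `parameterName` (for example `--first_layer_size=120`). Below is a minimal sketch of how a trainer might consume these flags and derive the `hidden_units` list used by `model_fn`; the geometric-shrink rule is an illustrative assumption, not necessarily what `trainer/task.py` implements.

```
# Sketch only: parse the tuned flags and build a layer-size list.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--first_layer_size', type=int, default=100)
parser.add_argument('--num_layers', type=int, default=4)
parser.add_argument('--scale_factor', type=float, default=0.7)
args, _ = parser.parse_known_args()

# Shrink each successive layer by scale_factor, never below 2 units.
hidden_units = [
    max(2, int(args.first_layer_size * args.scale_factor ** i))
    for i in range(args.num_layers)
]
```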
169 changes: 108 additions & 61 deletions census/lowlevel-tf/trainer/model.py
@@ -35,58 +35,50 @@
[''], [0.], [0.], [0.], [''], ['']]

# Categorical columns with vocab size
HASH_BUCKET_COLS = (('education', 16), ('marital_status', 7),
CATEGORICAL_COLS = (('education', 16), ('marital_status', 7),
('relationship', 6), ('workclass', 9), ('occupation', 15),
('native_country', 42))
KEY_COLS = (('gender', ('female', 'male')), ('race', ('Amer-Indian-Eskimo',
'Asian-Pac-Islander',
'Black',
'Other',
'White')))

('native_country', 42), ('gender', 2), ('race', 5))

CONTINUOUS_COLS = ('age', 'education_num', 'capital_gain', 'capital_loss',
'hours_per_week')

CATEGORICAL_COLS = HASH_BUCKET_COLS + tuple((col, len(keys)) for col, keys in KEY_COLS)
LABELS = [' <=50K', ' >50K']
LABEL_COLUMN = 'income_bracket'

UNUSED_COLUMNS = set(CSV_COLUMNS) - set(
zip(*CATEGORICAL_COLS)[0] + CONTINUOUS_COLS + (LABEL_COLUMN,))

TRAIN, EVAL, PREDICT = 'TRAIN', 'EVAL', 'PREDICT'


# Graph creation section for training and evaluation
def model_fn(features,
def model_fn(mode,
features,
labels,
hidden_units=[100, 70, 50, 20],
learning_rate=0.5,
batch_size=40):
learning_rate=0.1):
"""Create a Feed forward network classification network

Args:
input_x (tf.placeholder): Feature placeholder input
mode (string): Mode of operation, one of training, evaluation, or prediction
features (dict): Dictionary of input feature Tensors
labels (Tensor): Class label Tensor
hidden_units (list): Hidden units
num_classes (int): Number of classes
learning_rate (float): Learning rate for the SGD

Returns:
Tuple (train_op, accuracy_op, global_step, predictions): Tuple containing
training graph, accuracy graph, global step and predictions
"""
label_values = tf.constant(LABELS)

# Convert categorical (string) values to one_hot values
for col, bucket_size in HASH_BUCKET_COLS:
for col, bucket_size in CATEGORICAL_COLS:
features[col] = string_ops.string_to_hash_bucket_fast(
features[col], bucket_size)

for col, keys in KEY_COLS:
table = tf.contrib.lookup.string_to_index_table_from_tensor(
tf.constant(keys))
features[col] = table.lookup(features[col])

for col, size in CATEGORICAL_COLS:
features[col] = tf.squeeze(tf.one_hot(
features[col],
size,
bucket_size,
axis=1,
dtype=tf.float32), axis=[2])

@@ -114,42 +106,102 @@ def model_fn(features,

# Make predictions
logits = curr_layer
probabilities = tf.nn.softmax(logits)
predictions = tf.argmax(probabilities, 1)

# Make labels a vector
labels = tf.squeeze(labels)
if mode in (PREDICT, EVAL):
probabilities = tf.nn.softmax(logits)
predicted_indices = tf.argmax(probabilities, 1)

# Build training operation.
global_step = tf.contrib.framework.get_or_create_global_step()
cross_entropy = tf.reduce_mean(
tf.nn.sparse_softmax_cross_entropy_with_logits(
logits=logits, labels=labels))

train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(
if mode in (TRAIN, EVAL):
# Convert the string label column to indices
# Build a Hash Table inside the graph
table = tf.contrib.lookup.string_to_index_table_from_tensor(
label_values)

# Use the hash table to convert string labels to ints
label_indices = table.lookup(labels)

# Make labels a vector
label_indices_vector = tf.squeeze(label_indices)

# global_step is necessary in eval to correctly load the step
# of the checkpoint we are evaluating
global_step = tf.contrib.framework.get_or_create_global_step()

if mode == PREDICT:
# Convert predicted_indices back into strings
return {
'predictions': tf.gather(label_values, predicted_indices),
'confidence': tf.gather(probabilities, predicted_indices)
}

if mode == TRAIN:
# Build training operation.
cross_entropy = tf.reduce_mean(
tf.nn.sparse_softmax_cross_entropy_with_logits(
logits=logits, labels=label_indices_vector))
train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(
cross_entropy, global_step=global_step)
return train_op, global_step

if mode == EVAL:
# Return accuracy and area under ROC curve metrics
# See https://en.wikipedia.org/wiki/Receiver_operating_characteristic
# See https://www.kaggle.com/wiki/AreaUnderCurve
return {
'accuracy': tf.contrib.metrics.streaming_accuracy(
predicted_indices, label_indices),
'auroc': tf.contrib.metrics.streaming_auc(predicted_indices, label_indices)
}


def build_serving_inputs(mode, default_batch_size=None):
if mode == 'CSV':
placeholders = {'csv_row': tf.placeholder(
shape=[default_batch_size],
dtype=tf.string
)}
features = parse_csv(placeholders['csv_row'])
features.pop(LABEL_COLUMN)
else:
feature_spec = {}
for feat in CONTINUOUS_COLS:
feature_spec[feat] = tf.FixedLenFeature(shape=[], dtype=tf.float32)

for feat, _ in CATEGORICAL_COLS:
feature_spec[feat] = tf.FixedLenFeature(shape=[], dtype=tf.string)

tf_record = tf.placeholder(
shape=[default_batch_size],
dtype=tf.string,
name='tf_record'
)
feature_scalars = tf.parse_example(tf_record, feature_spec)
features = {
key: tf.expand_dims(tensor, -1)
for key, tensor in feature_scalars.iteritems()
}
if mode == 'TF_RECORD':
placeholders = {'tf_record': tf_record}
else:
placeholders = feature_scalars

accuracy_op = tf.reduce_mean(tf.to_float(tf.equal(predictions, labels)))
return features, placeholders

return train_op, accuracy_op, global_step, predictions

def parse_csv(rows_string_tensor):
"""Takes the string input tensor and returns a dict of rank-2 tensors."""

def parse_label_column(label_string_tensor):
"""Parses a string tensor into the label tensor
Args:
label_string_tensor: Tensor of dtype string. Result of parsing the
CSV column specified by LABEL_COLUMN
Returns:
A Tensor of the same shape as label_string_tensor: an int64 Tensor
representing the label index for classification tasks, or a float32
Tensor representing the value for a regression task.
"""
# Build a Hash Table inside the graph
table = tf.contrib.lookup.string_to_index_table_from_tensor(
tf.constant(LABELS))
# Takes a rank-1 tensor and converts it into rank-2 tensor
# Example: if the data is [1,2,3,4] then it is converted into [[1],[2],[3],[4]]
row_columns = tf.expand_dims(rows_string_tensor, -1)
columns = tf.decode_csv(row_columns, record_defaults=CSV_COLUMN_DEFAULTS)
features = dict(zip(CSV_COLUMNS, columns))

# Use the hash table to convert string labels to ints
return table.lookup(label_string_tensor)
# Remove unused columns
for col in UNUSED_COLUMNS:
features.pop(col)
return features


def input_fn(filenames,
@@ -158,6 +210,9 @@ def input_fn(filenames,
skip_header_lines=0,
batch_size=40):
"""Generates an input function for training or evaluation.
This uses an input-pipeline approach with a filename queue to read data,
so that the entire dataset does not have to be loaded into memory.

Args:
filenames: [str] list of CSV files to read data from.
num_epochs: int how many times through to read the data.
@@ -179,17 +234,9 @@ def input_fn(filenames,

_, rows = reader.read_up_to(filename_queue, num_records=batch_size)

# model_fn expects rank 2 tensors.
row_columns = tf.expand_dims(rows, -1)
features = parse_csv(rows)

# Parse the CSV File
columns = tf.decode_csv(row_columns, record_defaults=CSV_COLUMN_DEFAULTS)
features = dict(zip(CSV_COLUMNS, columns))

# Remove unused columns
for col in UNUSED_COLUMNS:
features.pop(col)

if shuffle:
# This operation builds up a buffer of rows so that, even between batches,
# rows are fed to training in a suitably randomized order.
@@ -200,6 +247,6 @@ def input_fn(filenames,
min_after_dequeue=batch_size*2 + 1,
num_threads=multiprocessing.cpu_count(),
enqueue_many=True,
allow_smaller_final_batch=True
)
label_tensor = parse_label_column(features.pop(LABEL_COLUMN))
return features, label_tensor
return features, features.pop(LABEL_COLUMN)
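
For context, here is a minimal sketch of how `input_fn` and `model_fn` from this module might be wired together for training; the actual loop lives in `trainer/task.py`, and `run_training` and its arguments are illustrative names only.

```
# Sketch only: drive model.input_fn / model.model_fn in TRAIN mode (TF 1.0).
import tensorflow as tf
from trainer import model

def run_training(train_files, job_dir, max_steps):
    with tf.Graph().as_default():
        features, labels = model.input_fn(
            train_files, num_epochs=None, shuffle=True)
        train_op, global_step = model.model_fn(model.TRAIN, features, labels)

        # MonitoredTrainingSession handles variable initialization, queue
        # runners, and periodic checkpointing to job_dir.
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=job_dir) as session:
            step = session.run(global_step)
            while not session.should_stop() and step < max_steps:
                step, _ = session.run([global_step, train_op])
```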