Expose DB backend option in web UI

lukeyeager · lukeyeager · commit 10f3cd21925a · 2015-09-09T17:34:01.000-07:00
Show correct filesize approximation for HDF5
diff --git a/digits/dataset/images/classification/forms.py b/digits/dataset/images/classification/forms.py
@@ -15,6 +15,29 @@ class ImageClassificationDatasetForm(ImageDatasetForm):
     Defines the form used to create a new ImageClassificationDatasetJob
     """
 
+    backend = wtforms.SelectField('DB backend',  # database backend handed to CreateDbTask (lmdb or hdf5)
+            choices = [
+                ('lmdb', 'LMDB'),
+                ('hdf5', 'HDF5'),
+                ],
+            default='lmdb',  # NOTE(review): new.html renders form.backend.tooltip but no tooltip is set on this plain wtforms field - confirm
+            )
+
+    def validate_backend(form, field):  # WTForms inline validator for `backend` (validate_<fieldname>); resets the option the chosen backend does not use
+        if field.data == 'lmdb':
+            form.compression.data = 'none'  # new.html hides the compression selector when LMDB is chosen
+        elif field.data == 'hdf5':
+            form.encoding.data = 'none'  # new.html hides the encoding selector when HDF5 is chosen
+
+    compression = utils.forms.SelectField('DB compression',  # passed to CreateDbTask, which adds --compression only when the value is not 'none'
+            choices = [
+                ('none', 'None'),
+                ('gzip', 'GZIP'),
+                ],
+            default='none',  # validate_backend also forces 'none' whenever the backend is lmdb
+            tooltip='Compressing the dataset may significantly decrease the size of your database files, but it may increase read and write times.',
+            )
+
     # Use a SelectField instead of a HiddenField so that the default value
     # is used when nothing is provided (through the REST API)
     method = wtforms.SelectField(u'Dataset type',
diff --git a/digits/dataset/images/classification/views.py b/digits/dataset/images/classification/views.py
@@ -84,17 +84,21 @@ def from_folders(job, form):
 
     ### Add CreateDbTasks
 
+    backend = form.backend.data
     encoding = form.encoding.data
+    compression = form.compression.data
 
     job.tasks.append(
             tasks.CreateDbTask(
                 job_dir     = job.dir(),
                 parents     = parse_train_task,
                 input_file  = utils.constants.TRAIN_FILE,
                 db_name     = utils.constants.TRAIN_DB,
+                backend     = backend,
                 image_dims  = job.image_dims,
                 resize_mode = job.resize_mode,
                 encoding    = encoding,
+                compression = compression,
                 mean_file   = utils.constants.MEAN_FILE_CAFFE,
                 labels_file = job.labels_file,
                 )
@@ -107,9 +111,11 @@ def from_folders(job, form):
                     parents     = val_parents,
                     input_file  = utils.constants.VAL_FILE,
                     db_name     = utils.constants.VAL_DB,
+                    backend     = backend,
                     image_dims  = job.image_dims,
                     resize_mode = job.resize_mode,
                     encoding    = encoding,
+                    compression = compression,
                     labels_file = job.labels_file,
                     )
                 )
@@ -121,9 +127,11 @@ def from_folders(job, form):
                     parents     = test_parents,
                     input_file  = utils.constants.TEST_FILE,
                     db_name     = utils.constants.TEST_DB,
+                    backend     = backend,
                     image_dims  = job.image_dims,
                     resize_mode = job.resize_mode,
                     encoding    = encoding,
+                    compression = compression,
                     labels_file = job.labels_file,
                     )
                 )
@@ -141,8 +149,10 @@ def from_files(job, form):
                 )
         job.labels_file = utils.constants.LABELS_FILE
 
-    encoding = form.encoding.data
     shuffle = bool(form.textfile_shuffle.data)
+    backend = form.backend.data
+    encoding = form.encoding.data
+    compression = form.compression.data
 
     ### train
     if form.textfile_use_local_files.data:
@@ -162,10 +172,12 @@ def from_files(job, form):
                 job_dir     = job.dir(),
                 input_file  = train_file,
                 db_name     = utils.constants.TRAIN_DB,
+                backend     = backend,
                 image_dims  = job.image_dims,
                 image_folder= image_folder,
                 resize_mode = job.resize_mode,
                 encoding    = encoding,
+                compression = compression,
                 mean_file   = utils.constants.MEAN_FILE_CAFFE,
                 labels_file = job.labels_file,
                 shuffle     = shuffle,
@@ -192,10 +204,12 @@ def from_files(job, form):
                     job_dir     = job.dir(),
                     input_file  = val_file,
                     db_name     = utils.constants.VAL_DB,
+                    backend     = backend,
                     image_dims  = job.image_dims,
                     image_folder= image_folder,
                     resize_mode = job.resize_mode,
                     encoding    = encoding,
+                    compression = compression,
                     labels_file = job.labels_file,
                     shuffle     = shuffle,
                     )
@@ -221,10 +235,12 @@ def from_files(job, form):
                     job_dir     = job.dir(),
                     input_file  = test_file,
                     db_name     = utils.constants.TEST_DB,
+                    backend     = backend,
                     image_dims  = job.image_dims,
                     image_folder= image_folder,
                     resize_mode = job.resize_mode,
                     encoding    = encoding,
+                    compression = compression,
                     labels_file = job.labels_file,
                     shuffle     = shuffle,
                     )
diff --git a/digits/dataset/images/views.py b/digits/dataset/images/views.py
@@ -28,14 +28,15 @@ def image_dataset_resize_example():
         height = int(flask.request.form['height'])
         channels = int(flask.request.form['channels'])
         resize_mode = flask.request.form['resize_mode']
+        backend = flask.request.form['backend']
         encoding = flask.request.form['encoding']
 
         image = utils.image.resize_image(image, height, width,
                 channels=channels,
                 resize_mode=resize_mode,
                 )
 
-        if encoding == 'none':
+        if backend != 'lmdb' or encoding == 'none':
             length = len(image.tostring())
         else:
             s = StringIO()
diff --git a/digits/dataset/tasks/create_db.py b/digits/dataset/tasks/create_db.py
@@ -17,18 +17,20 @@
 class CreateDbTask(Task):
     """Creates a database"""
 
-    def __init__(self, input_file, db_name, image_dims, **kwargs):
+    def __init__(self, input_file, db_name, backend, image_dims, **kwargs):
         """
         Arguments:
         input_file -- read images and labels from this file
         db_name -- save database to this location
+        backend -- database backend (lmdb/hdf5)
         image_dims -- (height, width, channels)
 
         Keyword Arguments:
         image_folder -- prepend image paths with this folder
         shuffle -- shuffle images before saving
         resize_mode -- used in utils.image.resize_image()
         encoding -- 'none', 'png' or 'jpg'
+        compression -- 'none' or 'gzip'
         mean_file -- save mean file to this location
         labels_file -- used to print category distribution
         """
@@ -37,6 +39,7 @@ def __init__(self, input_file, db_name, image_dims, **kwargs):
         self.shuffle = kwargs.pop('shuffle', True)
         self.resize_mode = kwargs.pop('resize_mode' , None)
         self.encoding = kwargs.pop('encoding', None)
+        self.compression = kwargs.pop('compression', None)
         self.mean_file = kwargs.pop('mean_file', None)
         self.labels_file = kwargs.pop('labels_file', None)
 
@@ -45,6 +48,7 @@ def __init__(self, input_file, db_name, image_dims, **kwargs):
 
         self.input_file = input_file
         self.db_name = db_name
+        self.backend = backend
         self.image_dims = image_dims
         if image_dims[2] == 3:
             self.image_channel_order = 'BGR'
@@ -87,6 +91,11 @@ def __setstate__(self, state):
                 self.encoding = 'none'
         self.pickver_task_createdb = PICKLE_VERSION
 
+        if not hasattr(self, 'backend'):
+            self.backend = 'lmdb'
+        if not hasattr(self, 'compression'):
+            self.compression = 'none'
+
     @override
     def name(self):
         if self.db_name == utils.constants.TRAIN_DB or 'train' in self.db_name.lower():
@@ -133,6 +142,7 @@ def task_arguments(self, resources):
                 self.path(self.db_name),
                 self.image_dims[1],
                 self.image_dims[0],
+                '--backend=%s' % self.backend,
                 '--channels=%s' % self.image_dims[2],
                 '--resize_mode=%s' % self.resize_mode,
                 ]
@@ -147,6 +157,8 @@ def task_arguments(self, resources):
             args.append('--shuffle')
         if self.encoding and self.encoding != 'none':
             args.append('--encoding=%s' % self.encoding)
+        if self.compression and self.compression != 'none':
+            args.append('--compression=%s' % self.compression)
 
         return args
 
diff --git a/digits/templates/datasets/images/classification/new.html b/digits/templates/datasets/images/classification/new.html
@@ -33,15 +33,6 @@ <h1>New Image Classification Dataset</h1>
                     {{ form.resize_channels(class='form-control') }}
                 </div>
             </div>
-            <div class="row">
-                <div class="form-group{{ ' has-error' if form.encoding.errors else '' }}">
-                    <div class="form-group{{' has-error' if form.encoding.errors}}">
-                        {{form.encoding.label}}
-                        {{form.encoding.tooltip}}
-                        {{form.encoding(class='form-control')}}
-                    </div>
-                </div>
-            </div>
             <div class="row">
                 <div class="form-group{{ ' has-error' if form.resize_width.errors or form.resize_height.errors else '' }}">
                     <label>Image size</label>
@@ -82,6 +73,7 @@ <h1>New Image Classification Dataset</h1>
                 "width":    $("#resize_width").val(),
                 "height":   $("#resize_height").val(),
                 "resize_mode":   $("#resize_mode").val(),
+                "backend": $("#backend").val(),
                 "encoding": $("#encoding").val(),
             },
             function(response) {
@@ -369,6 +361,49 @@ <h1>New Image Classification Dataset</h1>
 
     <div class="row">
         <div class="col-sm-6 col-sm-offset-3 well">
+            <div class="form-group{{ ' has-error' if form.backend.errors else '' }}">
+                {{ form.backend.label }}
+                {{ form.backend.tooltip }}
+                {{ form.backend(class='form-control') }}
+            </div>
+            <div id="backend-hdf5-warning" class="alert alert-warning" style="display:none;">
+                <b>NOTE:</b> HDF5 is not fully supported by Caffe or by DIGITS
+                <ul>
+                    <li>The standard networks will need some minor customizations before use (change <i>Data</i> layers to <i>HDF5Data</i> layers)</li>
+                    <li><i>HDF5Data</i> layers do not support mean subtraction</li>
+                </ul>
+            </div>
+            <div class="form-group{{ ' has-error' if form.compression.errors else '' }}">
+                <div class="form-group{{' has-error' if form.compression.errors}}">
+                    {{form.compression.label}}
+                    {{form.compression.tooltip}}
+                    {{form.compression(class='form-control')}}
+                </div>
+            </div>
+            <div class="form-group{{ ' has-error' if form.encoding.errors else '' }}">
+                <div class="form-group{{' has-error' if form.encoding.errors}}">
+                    {{form.encoding.label}}
+                    {{form.encoding.tooltip}}
+                    {{form.encoding(class='form-control')}}
+                </div>
+            </div>
+            <script>
+function backendChanged()
+{
+    val = $("#backend").val();
+    if (val == 'lmdb') {
+        $("#compression").parent().hide();
+        $("#encoding").parent().show();
+        $("#backend-hdf5-warning").hide();
+    } else if (val == 'hdf5') {
+        $("#encoding").parent().hide();
+        $("#compression").parent().show();
+        $("#backend-hdf5-warning").show();
+    }
+}
+$("#backend").change(backendChanged);
+backendChanged();
+            </script>
             <div class="form-group{{ ' has-error' if form.dataset_name.errors else '' }}">
                 {{ form.dataset_name.label }}
                 {{ form.dataset_name(class='form-control') }}
diff --git a/digits/templates/datasets/images/classification/show.html b/digits/templates/datasets/images/classification/show.html
@@ -12,14 +12,18 @@ <h4>Job Information</h4>
         <dd>{{ job.dir() }}</dd>
     </dl>
     <dl>
-        <dt>Image Type</dt>
-        <dd>{{'Color' if job.image_dims[2] == 3 else 'Grayscale'}}</dd>
-        <dt>Image Encoding</dt>
-        <dd>{{job.train_db_task().encoding}}</dd>
         <dt>Image Dimensions</dt>
         <dd>{{job.image_dims[1]}}x{{job.image_dims[0]}}</dd>
+        <dt>Image Type</dt>
+        <dd>{{'Color' if job.image_dims[2] == 3 else 'Grayscale'}}</dd>
         <dt>Resize Transformation</dt>
         <dd>{{ job.resize_mode_name() }}</dd>
+        <dt>DB Backend</dt>
+        <dd>{{job.train_db_task().backend}}</dd>
+        <dt>Image Encoding</dt>
+        <dd>{{job.train_db_task().encoding}}</dd>
+        <dt>DB Compression</dt>
+        <dd>{{job.train_db_task().compression}}</dd>
     </dl>
 </div>
 {% endmacro %}
diff --git a/docs/API.md b/docs/API.md
@@ -1,6 +1,6 @@
 # REST API
 
-*Generated Sep 01, 2015*
+*Generated Sep 02, 2015*
 
 DIGITS exposes its internal functionality through a REST API. You can access these endpoints by performing a GET or POST on the route, and a JSON object will be returned.
 
diff --git a/docs/FlaskRoutes.md b/docs/FlaskRoutes.md
@@ -1,6 +1,6 @@
 # Flask Routes
 
-*Generated Sep 01, 2015*
+*Generated Sep 02, 2015*
 
 Documentation on the various routes used internally for the web application.