Commit b548bde

adding yaml file to tfrecord encoding
1 parent b08b27b commit b548bde

17 files changed: +140 -135 lines changed

estimator.py (+4 -3)

@@ -9,11 +9,11 @@

 import os
 import sys
-from yaml import load, dump
+from yaml import load
 slim = tf.contrib.slim

 #Open and read the yaml file:
-stream = open(os.path.join(os.getcwd(), "config_multiclass.yaml"))
+stream = open(os.path.join(os.getcwd(), "yaml","config_multiclass.yaml"))
 data = load(stream)

 #=======Dataset Informations=======#
@@ -24,7 +24,8 @@
 gpu_p = data["gpu_p"]
 #Location of the checkpoint file
 checkpoint_dir= data["checkpoint_dir"]
-checkpoint_file = os.path.join(checkpoint_dir, "mobilenet_v2_1.4_224.ckpt")
+checkpoint_pattern = data["checkpoint_pattern"]
+checkpoint_file = os.path.join(checkpoint_dir, checkpoint_pattern)
 ckpt_state = tf.train.get_checkpoint_state(train_dir)
 image_size = data["image_size"]
 #Number of classes to predict
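For reference, a minimal sketch of the config file this script now reads: the key names (dataset_dir, gpu_p, checkpoint_dir, checkpoint_pattern, image_size) are the ones read in this diff, while every value below is an invented placeholder. The sketch also uses yaml.safe_load, a safer drop-in for the bare load(stream) call:

import os
import yaml

# Hypothetical contents of yaml/config_multiclass.yaml: key names taken from
# the commit, values made up for illustration.
sample_config = """
dataset_dir: /data/flowers
gpu_p: 0.9
checkpoint_dir: /checkpoints
checkpoint_pattern: mobilenet_v2_1.4_224.ckpt
image_size: 224
"""

data = yaml.safe_load(sample_config)  # safe_load never constructs arbitrary objects
checkpoint_file = os.path.join(data["checkpoint_dir"], data["checkpoint_pattern"])
print(checkpoint_file)  # /checkpoints/mobilenet_v2_1.4_224.ckpt

Factoring the hard-coded mobilenet_v2_1.4_224.ckpt name out into a checkpoint_pattern key is what lets the same script restore any checkpoint without a code edit.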

estimator_multiclass.py (+14 -14)

@@ -13,18 +13,18 @@
 slim = tf.contrib.slim

 #Open and read the yaml file:
-stream = open(os.path.join(os.getcwd(), "config_multilabel.yaml"))
+stream = open(os.path.join(os.getcwd(), "yaml","config_multilabel.yaml"))
 data = load(stream)
-
+stream.close()
 #=======Dataset Informations=======#
 #==================================#
 dataset_dir = data["dataset_dir"]
 train_dir = os.path.join(os.getcwd(), "train")
-summary_dir = os.path.join(train_dir , "summary")
 gpu_p = data["gpu_p"]
 #Location of the checkpoint file
 checkpoint_dir= data["checkpoint_dir"]
-checkpoint_file = os.path.join(checkpoint_dir, "mobilenet_v2_1.4_224.ckpt")
+checkpoint_pattern = data["checkpoint_pattern"]
+checkpoint_file = os.path.join(checkpoint_dir, checkpoint_pattern)
 ckpt_state = tf.train.get_checkpoint_state(train_dir)
 image_size = data["image_size"]
 #Number of classes to predict
@@ -55,18 +55,17 @@
 #Create log_dir:
 if not os.path.exists(train_dir):
     os.mkdir(os.path.join(os.getcwd(),train_dir))
-if not os.path.exists(summary_dir):
-    os.mkdir(os.path.join(os.getcwd(),summary_dir))
+
 #===================================================================== Training ===========================================================================#
 #Adding the graph:
 #Set the verbosity to INFO level
 tf.reset_default_graph()
-tf.logging.set_verbosity(tf.logging.INFO)
+tf.logging.set_verbosity(tf.logging.DEBUG)

 def input_fn(mode, dataset_dir,file_pattern, file_pattern_for_counting, labels_to_name, batch_size, image_size):
     train_mode = mode==tf.estimator.ModeKeys.TRAIN
     with tf.name_scope("dataset"):
-        dataset = get_dataset_multiclass("eval" if train_mode else "eval",
+        dataset = get_dataset_multiclass("train" if train_mode else "eval",
             dataset_dir, file_pattern=file_pattern,
             file_pattern_for_counting=file_pattern_for_counting,
             labels_to_name=labels_to_name)
@@ -77,7 +76,7 @@ def input_fn(mode, dataset_dir,file_pattern, file_pattern_for_counting, labels_t

 def model_fn(features, mode):
     train_mode = mode==tf.estimator.ModeKeys.TRAIN
-    tf.summary.image("images",features['image/encoded'])
+    tf.summary.image("images", features['image/encoded'])
     #Create the model inference
     with slim.arg_scope(mobilenet_v2.training_scope(is_training=train_mode, weight_decay=1e-4, stddev=5e-2, bn_decay=0.99)):
         #TODO: Check mobilenet_v1 module, var "excluding
@@ -98,13 +97,14 @@ def model_fn(features, mode):
     #TODO: Add a func to transform logit tensor to a label-like tensor
     # If value[][class_id]<0.5 then value[][class_id] = 0. else value[][class_id]= 1.
     #It is necessary for a multilabel classification problem
-
+    logits_sig = tf.nn.sigmoid(logits,name="Sigmoid")
+    logits_sig = tf.to_float(tf.to_int32(logits_sig>=0.5))
     if mode != tf.estimator.ModeKeys.PREDICT:
         metrics = {
-            'Accuracy': tf.metrics.accuracy(features['image/class/id'], logits, name="acc_op"),
-            'Precision': tf.metrics.precision(features['image/class/id'], logits, name="precision_op"),
-            'Recall': tf.metrics.recall(features['image/class/id'], logits, name="recall_op"),
-            'Acc_Class': tf.metrics.mean_per_class_accuracy(features['image/class/id'], logits,len(labels_to_names), name="per_class_acc")
+            'Accuracy': tf.metrics.accuracy(features['image/class/id'], logits_sig, name="acc_op"),
+            'Precision': tf.metrics.precision(features['image/class/id'], logits_sig, name="precision_op"),
+            'Recall': tf.metrics.recall(features['image/class/id'], logits_sig, name="recall_op"),
+            'Acc_Class': tf.metrics.mean_per_class_accuracy(features['image/class/id'], logits_sig, len(labels_to_names), name="per_class_acc")
         }
         for name, value in metrics.items():
             items_list = value[1].get_shape().as_list()
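The two new logits_sig lines are the substance of this hunk: tf.metrics.accuracy, precision and recall compare hard 0/1 predictions against the 0/1 multilabel targets, so the raw logits have to be squashed through a sigmoid and thresholded first. A NumPy-only sketch of the same element-wise operation, on toy values rather than model outputs:

import numpy as np

logits = np.array([[2.1, -0.3, 0.0],       # 2 examples x 3 independent labels
                   [-1.5, 0.8, 3.0]])
probs = 1.0 / (1.0 + np.exp(-logits))      # element-wise sigmoid
preds = (probs >= 0.5).astype(np.float32)  # threshold each label on its own
print(preds)  # [[1. 0. 1.]
              #  [0. 1. 1.]]

In TF 1.x terms, the tf.to_float(tf.to_int32(...)) round trip above performs the same cast; tf.cast(logits_sig >= 0.5, tf.float32) would be the one-step equivalent.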

get_tfrec.py (new file, +49)

@@ -0,0 +1,49 @@
+import random
+import os
+import tensorflow as tf
+import yaml
+from utils.data_utils import _get_train_valid, _convert_dataset_bis
+
+
+
+#====================================================DEFINE YOUR ARGUMENTS=======================================================================
+stream = open(os.path.join(os.getcwd(), "yaml","config_tfrec.yaml"))
+data = yaml.load(stream)
+print(data)
+dataset_dir = data["dataset_dir"]
+tfrecord_filename = data["tfrecord_filename"]
+validation_size = data["validation_size"]
+num_shards = data["num_shards"]
+class_names_to_ids = data["class_names_to_ids"]
+
+
+def main():
+    #==============================================================CHECKS==========================================================================
+
+    #Check if there is a tfrecord_filename entered
+
+    if not tfrecord_filename:
+        raise ValueError('tfrecord_filename is empty. Please state a tfrecord_filename argument.')
+
+
+
+    #Check if there is a dataset directory entered
+
+    if not dataset_dir:
+        raise ValueError('dataset_dir is empty. Please state a dataset_dir argument.')
+
+    #==============================================================END OF CHECKS===================================================================
+
+    #Get a list of photos filenames like ['123.jpg', '456.jpg'...] and a list of sorted class names from parsing the subdirectories.
+    photos_train, class_train, photos_valid, class_valid = _get_train_valid(dataset_dir)
+
+    # First, convert the training and validation sets.
+    _convert_dataset_bis('train', photos_train, class_train, class_names_to_ids,
+                         dataset_dir = dataset_dir, tfrecord_filename = tfrecord_filename, batch_size=500, _NUM_SHARDS = num_shards)
+    _convert_dataset_bis('eval', photos_valid, class_valid, class_names_to_ids,
+                         dataset_dir = dataset_dir, tfrecord_filename = tfrecord_filename, batch_size=200, _NUM_SHARDS = num_shards)
+
+    print('\n Finished converting the %s dataset!' % (tfrecord_filename))
+
+if __name__ == "__main__":
+    main()
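A guess at the shape of the yaml/config_tfrec.yaml that this new script expects: the key names match the reads above, with class_names_to_ids arriving as a YAML mapping that loads straight into a Python dict. Every value below is a placeholder:

import yaml

# Hypothetical yaml/config_tfrec.yaml: keys from get_tfrec.py, values invented.
sample = """
dataset_dir: /data/photos
tfrecord_filename: flowers
validation_size: 0.1
num_shards: 2
class_names_to_ids:
  daisy: 0
  rose: 1
  tulip: 2
"""

cfg = yaml.safe_load(sample)
print(cfg["class_names_to_ids"]["rose"])  # 1, since the mapping loads as a plain dict

Keeping the label map in the config rather than in the script (as get_tfrec_csv.py still does below) means a new dataset only needs a new YAML file.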

tfrecord/get_tfrec_csv.py renamed to get_tfrec_csv.py (+21 -20)

@@ -2,7 +2,7 @@

 import tensorflow as tf

-from utils.utils_csv import _dataset_exists, _get_infos, _convert_dataset
+from utils.utils_csv import _dataset_exists, _get_infos, _convert_dataset_multilabel

 import pandas as pd

@@ -27,22 +27,22 @@

 #TODO change this dict into names to ids
 class_names_to_ids = {
-    'No Finding':0,
-    'Atelectasis':1,
-    'Cardiomegaly':2,
-    'Effusion':3,
-    'Infiltration':4,
-    'Mass':5,
-    'Nodule':6,
-    'Pneumonia':7,
-    'Pneumothorax':8,
-    'Consolidation':9,
-    'Edema':10,
-    'Emphysema':11,
-    'Fibrosis':12,
-    'Pleural_Thickening':13,
-    'Hernia':14
-}
+    'No Finding': 0,
+    'Atelectasis' : 1,
+    'Cardiomegaly' : 2,
+    'Effusion' : 3,
+    'Infiltration' : 4,
+    'Mass' : 5,
+    'Nodule' : 6,
+    'Pneumonia' : 7,
+    'Pneumothorax' : 8,
+    'Consolidation' : 9,
+    'Edema' : 10,
+    'Emphysema' : 11,
+    'Fibrosis' : 12,
+    'Pleural_Thickening' : 13,
+    'Hernia' : 14,
+}

 def main():
     #==============================================================CHECKS==========================================================================
@@ -70,16 +70,17 @@ def main():
     #==============================================================END OF CHECKS===================================================================
     grouped=_get_infos(FLAGS.dataset_dir,"Data_Entry_2017.csv")
     # Divide the training datasets into train and test:(For ChestX like datasets)
+

     training_filenames = pd.DataFrame.sample(grouped, frac=(1-FLAGS.validation_size))
-    training_filenames = pd.DataFrame.sample(training_filenames, frac=1,random_state=3)
+    training_filenames = pd.DataFrame.sample(training_filenames, frac=1,random_state=100)
     validation_filenames = grouped.loc[~grouped.index.isin(training_filenames.index), :]
     valid_filenames = pd.DataFrame.sample(validation_filenames, frac=1,random_state=3)

     # First, convert the training and validation sets.
-    _convert_dataset('eval', valid_filenames, class_names_to_ids,
+    _convert_dataset_multilabel('eval', valid_filenames, class_names_to_ids,
                      dataset_dir = FLAGS.dataset_dir, tfrecord_filename = FLAGS.tfrecord_filename, _NUM_SHARDS=1)
-    _convert_dataset('train', training_filenames, class_names_to_ids,
+    _convert_dataset_multilabel('train', training_filenames, class_names_to_ids,
                      dataset_dir = FLAGS.dataset_dir, tfrecord_filename = FLAGS.tfrecord_filename, _NUM_SHARDS=FLAGS.num_shards)
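The split logic above, in isolation: DataFrame.sample draws the training fraction, and the validation set is whatever index labels were not drawn, so the two sets stay disjoint by construction. A toy frame stands in for the ChestX CSV:

import pandas as pd

df = pd.DataFrame({"image": ["%05d.png" % i for i in range(10)],
                   "label": ["No Finding"] * 10})
validation_size = 0.2

train = df.sample(frac=1 - validation_size, random_state=100)  # 8 of 10 rows
valid = df.loc[~df.index.isin(train.index), :]                 # the remaining 2
train = train.sample(frac=1, random_state=100)                 # frac=1 only reshuffles
print(len(train), len(valid))  # 8 2

Changing random_state from 3 to 100 only changes which rows land where; the complement construction keeps the two sets disjoint either way.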

File renamed without changes.

rnn/__init__.py

Whitespace-only changes.

tfrecord/get_tfrec.py (-62)

This file was deleted.

utils/data_utils.py (+19 -22)

@@ -26,9 +26,10 @@ def compute_stats_fn(image_data):
                       tf.squeeze(b_mean), tf.squeeze(b_stddev)])
     return result

-def computes_stats(sess, images_data, batch_size):
-    images = tf.placeholder(dtype=tf.string, shape=[batch_size])
-    results = tf.map_fn(lambda x: compute_stats_fn(x), images, dtype=tf.float32)
+def computes_stats(sess, images_data):
+    images = tf.placeholder(dtype=tf.string, shape=[None])
+    results = tf.map_fn(lambda x: compute_stats_fn(x), images, dtype=tf.float32,
+                        parallel_iterations=4)
     alpha = sess.run(results, feed_dict={images:images_data})
     GEN_mean, GEN_stddev, R_mean,\
     R_stddev, G_mean, G_stddev, B_mean,\
@@ -164,7 +165,6 @@ class names. Each subdirectory should contain PNG or JPG encoded images.

     for root, _ , files in os.walk(dataset_dir):
         path = root.split(os.sep)
-        print(path)
         for file in files:
             photo_filenames.append(os.path.join(root,file))
             class_names.append(path[-1].split("_")[-1])
@@ -246,44 +246,41 @@ def _convert_dataset_bis(split_name, filenames, class_name, class_names_to_ids,
     images_data = []
     class_id_data = []
     assert split_name in ['train', 'eval']
-    max_id = int(math.ceil(len(filenames) / float(batch_size)))
+    lenght = len(filenames)
     output_filename = _get_dataset_filename(
         dataset_dir, split_name, tfrecord_filename = tfrecord_filename,stats=False)
     output_filename_stats = _get_dataset_filename(
         dataset_dir, split_name, tfrecord_filename = tfrecord_filename,stats=True)
+    tfrecord_stats = tf.python_io.TFRecordWriter(output_filename_stats)

     with tf.python_io.TFRecordWriter(output_filename) as tfrecord_writer_1:
-        for i in range(len(filenames)):
+        for i in range(lenght):
             # Read the filename:
             image_data = tf.gfile.FastGFile(filenames[i], 'rb').read()
             images_data.append(image_data)
             class_id = class_names_to_ids[class_name[i]]
             class_id_data.append(class_id)
             example_image = image_to_tfexample(image_data, class_id)
             tfrecord_writer_1.write(example_image.SerializeToString())
-    with tf.Graph().as_default():
-        with tf.Session('') as sess:
-            with tf.python_io.TFRecordWriter(output_filename_stats) as tfrecord_writer:
-                for i in range(max_id):
-                    start_ndx = i * batch_size
-                    end_ndx = min((i+1) * batch_size, len(filenames))
-                    try:
+            if (i+1) % batch_size == 0 or i == lenght-1:
+                with tf.Graph().as_default():
+                    with tf.Session('') as sess:
                         gen_mean, gen_stddev, r_mean, r_stddev,\
                         g_mean, g_stddev, b_mean,\
-                        b_stddev = computes_stats(sess, images_data[start_ndx:end_ndx], end_ndx-start_ndx)
+                        b_stddev = computes_stats(sess, images_data)
                         for j in range(len(gen_mean)):
-                            sys.stdout.write('\r>> Converting stats %d/%d shard %d' % (
-                                j+start_ndx, len(filenames), i))
+                            sys.stdout.write('\r>> Converting stats %d/%d' % (
+                                i+1, lenght))
                             sys.stdout.flush()
                             #Py3: use encode("utf-8")
                             example = stats_to_tfexample(gen_mean[j],
                                 gen_stddev[j], r_mean[j], r_stddev[j],
                                 g_mean[j], g_stddev[j], b_mean[j],
-                                b_stddev[j],class_name[start_ndx+j].encode(),
-                                class_id_data[start_ndx+j])
-                            tfrecord_writer.write(example.SerializeToString())
-                    except:
-                        print("batch of image is corrupted")
+                                b_stddev[j],class_name[j].encode(),
+                                class_id_data[j])
+                            tfrecord_stats.write(example.SerializeToString())
+                        images_data = []
+                        class_id_data = []
     sys.stdout.write('\n')
     sys.stdout.flush()
@@ -312,7 +309,7 @@ def _convert_dataset_multi(split_name, filenames, class_first_name, class_snd_na
     sys.stdout.flush()
     # Read the filename:
     image_data = tf.gfile.FastGFile(filenames[i], 'rb').read()
-    #TODO/This line is Special to MURA dataset for defining 13 classes.
+    #TODO/The following line is Special to MURA dataset for defining 13 classes.
     class_id = class_names_to_ids[class_snd_name[i]+"_"+class_first_name[i]]
     example_image = image_to_tfexample(image_data, class_id)
     tfrecord_writer.write(example_image.SerializeToString())
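The rewritten _convert_dataset_bis folds the old second pass into the main loop: images are buffered while the image TFRecord is written, and stats are computed whenever the buffer reaches batch_size or the last file is seen. The bare control-flow pattern, with a stub standing in for the session and stats machinery:

def flush(buffer):
    # stand-in for the computes_stats + stats_to_tfexample work
    print("stats over", len(buffer), "buffered images")

items = list(range(7))  # pretend: 7 image byte strings
batch_size = 3
buffer = []
length = len(items)
for i in range(length):
    buffer.append(items[i])
    if (i + 1) % batch_size == 0 or i == length - 1:
        flush(buffer)  # fires after items 3, 6 and 7
        buffer = []

One detail worth double-checking in the real function: class_id_data is reset with the buffer, but class_name[j] still indexes the full list from 0, so the name field of stats records after the first batch may pair with the wrong files.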

utils/gen_tfrec.py (+5 -3)

@@ -118,8 +118,10 @@ def process_fn(example):
         return example
     dataset = dataset.map(process_fn)
     if is_training and shuffle:
-        dataset = dataset.shuffle(1000)
-        dataset = dataset.repeat()
-        dataset = dataset.repeat(num_epochs)
+        dataset = dataset.shuffle(2000)
+        dataset = dataset.repeat(-1)
+    else:
+        #Evaluation or test cases:
+        dataset = dataset.repeat(1)
     dataset = dataset.batch(batch_size)
     return dataset
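The repeat change is behavioral rather than cosmetic: the old code chained repeat() and repeat(num_epochs), and repeating an already-infinite dataset is still infinite, so num_epochs was silently ignored. The new code cycles training data forever explicitly with repeat(-1), and the else branch pins evaluation to exactly one pass so the estimator's evaluate() call can terminate. A small demonstration of the count semantics, using TF 2.x eager mode purely for illustration (the repo itself is TF 1.x graph code):

import tensorflow as tf

ds = tf.data.Dataset.range(3)
print(list(ds.repeat(1).as_numpy_iterator()))           # [0, 1, 2]: one epoch, then stop
print(list(ds.repeat(-1).take(7).as_numpy_iterator()))  # [0, 1, 2, 0, 1, 2, 0]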

utils/text/__init__.py

Whitespace-only changes.
File renamed without changes.
