Commit da1191da authored by Jonas Müller's avatar Jonas Müller

prep cross validation with respect to time issue

Split based on hour + a basic check for the issue based on the timedeltas
parent ab7eebc7
import copy
import os
import random
from datetime import datetime
from optparse import OptionParser
from pathlib import Path
from keras_frcnn.simple_parser import get_data
def split(a, n):
    """Split sequence *a* into *n* contiguous chunks whose sizes differ by at most one.

    The first ``len(a) % n`` chunks receive one extra element. Returns a
    generator of slices, preserving the original order of *a*.
    """
    base, extra = divmod(len(a), n)
    return (
        a[chunk * base + min(chunk, extra):(chunk + 1) * base + min(chunk + 1, extra)]
        for chunk in range(n)
    )
def get_date2(filename):
    """Return the 'YYYYMMDD_HHMM' timestamp prefix of an image filename.

    An 'IMG'-prefixed name has its first four characters (e.g. 'IMG_')
    dropped first; one known WhatsApp export that carries no timestamp is
    mapped to a hard-coded stand-in time.
    """
    name = filename[4:] if filename.startswith('IMG') else filename
    # Special case: this file has no usable timestamp, so substitute one.
    if name == '20190508-WA0026.jpg':
        name = '20190508_123000.jpg'
    # 13 characters cover 'YYYYMMDD_HHMM' (date, underscore, hour, minute).
    return name[:13]
def get_date(filename):
    """Return the 'YYYYMMDD_HH' (date plus hour) part of the filename's timestamp.

    Drops the trailing minutes from get_date2's 'YYYYMMDD_HHMM' result so
    that images shot within the same hour share the same key.
    """
    timestamp = get_date2(filename)
    return timestamp[:-2]
# Example filename timestamp format: 20190520_120309
def time_diff(date1, date2):
    """Return the absolute time difference between two image filenames.

    Both arguments are filenames whose 'YYYYMMDD_HHMM' prefix is extracted
    with get_date2 and parsed to minute precision.

    Returns:
        (seconds, later, earlier): the non-negative difference in seconds,
        plus the two parsed datetimes ordered so that later >= earlier.
    """
    d1 = datetime.strptime(get_date2(date1), '%Y%m%d_%H%M')
    d2 = datetime.strptime(get_date2(date2), '%Y%m%d_%H%M')
    # Idiomatic tuple swap (replaces the old temp-variable swap with its
    # leftover unused `tmp`); guarantees d1 is the later timestamp.
    if d1 < d2:
        d1, d2 = d2, d1
    duration = d1 - d2  # timedelta; non-negative after the swap
    duration_in_s = duration.total_seconds()
    return duration_in_s, d1, d2
parser = OptionParser()
# NOTE(review): scrape artifact — a diff hunk header; the option definitions
# (output_path, --annotation_all, --num_folds) are outside the visible hunk.
......@@ -23,44 +50,80 @@ if not options.annotation_all: # if filename is not given
parser.error('Error: path to annotation file data must be specified. Pass --path to command line')
# Parse the combined annotation file; all_data entries carry at least
# 'filepath' and 'bboxes' (used below) — schema per keras_frcnn.simple_parser.
all_data, classes_count, class_mapping = get_data(options.annotation_all)
# Maps a 'YYYYMMDD_HH' date-hour string -> indices of images taken in that hour.
dates = {}
file_path_dict = {}
index_dict = {}
for idx, entry in enumerate(all_data):
# NOTE(review): the lines from here down to `print(fold)` appear to be the
# OLD (diff-removed) random-shuffle split implementation retained by the
# scraped diff view — confirm against the repository before relying on them.
file_path_dict[idx] = (Path(entry['filepath']).name, entry['filepath'])
print(file_path_dict)
# list of indexes representing where each number represents one image
images = list(range(len(file_path_dict.keys())))
# shuffle the data
random.shuffle(images)
fold = list(split(images, options.num_folds))
print(fold)
# New implementation: group image indices by the hour they were taken,
# keyed by get_date's 'YYYYMMDD_HH' string.
filename = Path(entry['filepath']).name
date = get_date(filename)
print(date)
if date not in dates.keys():
dates[date] = [idx]
else:
dates[date].append(idx)
# Remember the bare filename and full path for each image index.
index_dict[idx] = (filename, entry['filepath'])
print(dates)
# print(index_dict)
# Maps group size -> list of date-hour keys having that many images.
len_of_dates = {}
for date in dates.keys():
if len(dates[date]) not in len_of_dates.keys():
len_of_dates[len(dates[date])] = [date]
else:
len_of_dates[len(dates[date])].append(date)
print(len_of_dates)
# prepare ordered list containing the appearing length in the dataset
keys_of_len_of_dates = list(len_of_dates.keys())
keys_of_len_of_dates.sort(reverse=True)
# Create empty lists for each fold as sublist of folds
folds = []
for i in range(options.num_folds):
folds.append([])
# Append always the next longest sublist to the next shortest fold.
# Greedy longest-first balancing: folds end up similar in size while every
# image from the same hour stays in the same fold (the point of this commit —
# avoid near-duplicate shots leaking between train and test).
for key in keys_of_len_of_dates:
for date in len_of_dates[key]:
#find first min len fold
min_fold = 0
for idx, fold in enumerate(folds):
if len(fold) < len(folds[min_fold]):
min_fold = idx
folds[min_fold].extend(dates[date])
#print(folds)
for fold in folds:
fold.sort()
print('len', len(fold))
print(fold)
# Build cross-validation index sets: test[i] is fold i, train[i] is the
# union of all other folds.
train = {}
test = {}
# NOTE(review): the next three lines (iterating `fold`) appear to be the OLD
# diff-removed version of the loop that follows (iterating `folds`) — confirm
# against the repository.
for i in range(len(fold)):
test[i] = copy.deepcopy(fold[i])
for j in range(len(fold)):
for i in range(len(folds)):
test[i] = copy.deepcopy(folds[i])
for j in range(len(folds)):
if i != j:
if i not in train.keys():
train[i] = copy.deepcopy(fold[j])
train[i] = copy.deepcopy(folds[j])
else:
train[i].extend(fold[j])
train[i].extend(folds[j])
train[i].sort()
test[i].sort()
#visualize what was done:
'''
for i in range(len(fold)):
for i in range(len(folds)):
print('train')
print(train[i])
print('test')
print(test[i])
print()
'''
# Write one directory per fold with test/train annotation files in the
# same CSV format the parser reads: filename,x1,y1,x2,y2,class.
for i in range(len(fold)):
# write to files
for i in range(len(folds)):
new_dir = output_path+'fold_'+str(i)
if not os.path.exists(new_dir):
os.makedirs(new_dir)
......@@ -69,12 +132,12 @@ for i in range(len(fold)):
lines = []
for filenumber in test[i]:
for box in all_data[filenumber]['bboxes']:
# NOTE(review): paired diff lines — the first uses the removed
# file_path_dict lookup, the second the added index_dict lookup.
lines.append(file_path_dict[filenumber][0] + ',' + str(box['x1']) + ',' + str(box['y1']) + ',' + str(box['x2']) + ',' + str(box['y2']) + ',' +box['class']+ '\n')
lines.append(index_dict[filenumber][0] + ',' + str(box['x1']) + ',' + str(box['y1']) + ',' + str(box['x2']) + ',' + str(box['y2']) + ',' + box['class'] + '\n')
file.writelines(lines)
with open(new_dir+'/annotations_train.txt', 'w') as file:
lines = []
for filenumber in train[i]:
for box in all_data[filenumber]['bboxes']:
lines.append(file_path_dict[filenumber][0] + ',' + str(box['x1']) + ',' + str(box['y1']) + ',' + str(box['x2']) + ',' + str(box['y2']) + ',' +box['class']+ '\n')
lines.append(index_dict[filenumber][0] + ',' + str(box['x1']) + ',' + str(box['y1']) + ',' + str(box['x2']) + ',' + str(box['y2']) + ',' + box['class'] + '\n')
file.writelines(lines)
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment