Zewei Xu¹,²; Shaowen Wang¹,²; Lawrence V. Stanislawski³; Zhe Jiang⁴; Nattapon Jaroenchai¹,²; Arpan Man Sainju⁴; Ethan Shavers³; E. Lynn Usery³; Li Chen²,⁵; Zhiyu Li¹,²; Bin Su¹,²
¹Department of Geography and Geographic Information Science, University of Illinois at Urbana-Champaign, Urbana, IL, USA
²CyberGIS Center for Advanced Digital and Spatial Studies, University of Illinois at Urbana-Champaign, Urbana, IL, USA
³U.S. Geological Survey, Center of Excellence for Geospatial Information Science, Rolla, MO, USA
⁴Department of Computer Science, University of Alabama, Tuscaloosa, AL, USA
⁵School of Geosciences and Info-Physics, Central South University, Changsha, Hunan, China
Corresponding Author: nj7@illinois.edu
Notebook Structure:
This part of the code generates the samples for the training and validation process.
In this research, the training and validation samples are generated under four scenarios, selected by the parameter m in the block below.
The code generates 2000 samples and then divides them into training (2/3) and validation (1/3) sample sets. The data and labels are then saved to .npy files.
You have to restart the kernel after running the code block below.
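For reference, the 2/3–1/3 split described above amounts to the following pattern (a minimal sketch; the arrays are illustrative stand-ins, with shapes shrunk from the real 224 x 224 patches):

import numpy as np

# Illustrative stand-ins for the 2000 generated samples and labels.
samples = np.zeros((2000, 8, 8, 3))
labels = np.zeros((2000, 8, 8))

# Shuffle, then split 2/3 for training and 1/3 for validation.
idx = np.random.permutation(len(samples))
cut = len(samples) * 2 // 3
train_data, val_data = samples[idx[:cut]], samples[idx[cut:]]
train_label, val_label = labels[idx[:cut]], labels[idx[cut:]]
print(len(train_data), len(val_data))   # 1333 667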
# install required libraries for preprocessing.
!pip install --user imgaug==0.4.0
!pip install --user intervaltree
!pip install --user numpy --upgrade
# Please restart the kernel after this block finished running.
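After the kernel restart, it can help to confirm that the pinned versions were picked up (a quick optional check, not part of the original notebook):

import imgaug
import numpy
print(imgaug.__version__)   # expected: 0.4.0
print(numpy.__version__)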
First, we extract the training and validation samples from the reference dataset. Because the dataset is small, we can only draw a small number of samples. Therefore, we augment the training samples to increase the training sample size and improve generalization.
The code below shows the augmentation steps using the pre-sampled patches in "train_patches_top-left*.npy". The results are saved to train_data_aug*.npy and train_label_aug*.npy.
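The idea throughout is to stack the label as an extra channel so that the features and the label receive the identical geometric transform, then split them again afterwards. A minimal standalone sketch of that pattern (dummy data; the shapes are assumptions for illustration only):

import numpy as np
from imgaug import augmenters as iaa

# Dummy patch: 5 feature channels plus the label stacked as a 6th channel.
patch = np.random.rand(224, 224, 6).astype(np.float32)
rotate = iaa.Affine(rotate=(-50, 130))       # random angle in [-50, 130]
augmented = rotate.augment_image(patch)
features, label = augmented[:, :, :-1], augmented[:, :, -1]
print(features.shape, label.shape)           # (224, 224, 5) (224, 224)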
%reload_ext autoreload
%autoreload 2
root="/home/jovyan/shared_data/data/unet_streamline_detection/"   # shared input data
save_root="/home/jovyan/work/unet_streamline_detection/"          # writable output location
m = 'v2'   # scenario parameter: selects which set of pre-sampled patches to use
# Training data augmentation
import numpy as np
import os
import matplotlib.pyplot as plt
from imgaug import augmenters as iaa
# reference array
reference = np.load(root+'data/reference.npy')
# total prediction feature maps
total = np.load(root+'data/total.npy')
# Top-left coordinates of training patches
trainlo = np.load(root+'data/train_patches_top-left'+m+'.npy')
print("Load data complete!")
# add a 200-pixel buffer around each patch so geometric augmentations
# do not clip content at the patch edges
pad = 200
trainlo[:,0] += pad
trainlo[:,1] += pad
depth = total.shape[2]
reference = np.pad(reference,(pad,pad),'symmetric')
for i in range(depth):
    temp = np.pad(total[:,:,i],(pad,pad),'symmetric')[:,:,np.newaxis]
    if i == 0:
        totaln = temp
    else:
        totaln = np.concatenate((totaln,temp),axis = 2)
def process(lpathc,lref):
    # stack the label as the last channel so it receives identical transforms
    lpathc = np.concatenate((lpathc,lref[:,:,np.newaxis]),axis = 2)
    # rotate by a random degree between -50 and 130
    rotate = iaa.Affine(rotate=(-50, 130))
    image1 = rotate.augment_image(lpathc)
    # rotate by a random degree between 230 and 310
    rotate = iaa.Affine(rotate=(230, 310))
    image2 = rotate.augment_image(lpathc)
    # scale by a random ratio between 0.3 and 0.6
    scale = iaa.Affine(scale={"x": (0.3, 0.6), "y": (0.3, 0.6)})
    image3 = scale.augment_image(lpathc)
    # scale by a random ratio between 1.5 and 2.0
    scale = iaa.Affine(scale={"x": (1.5, 2.0), "y": (1.5, 2.0)})
    image4 = scale.augment_image(lpathc)
    # shear by a random degree between -30 and 30
    shear = iaa.Affine(shear=(-30, 30))
    image5 = shear.augment_image(lpathc)
    # flip horizontally
    flip = iaa.Fliplr(1.0)
    image6 = flip.augment_image(lpathc)
    # add Gaussian noise (disabled)
    #gua = iaa.AdditiveGaussianNoise(scale=(10, 20))
    #image6 = gua.augment_image(lpathc)
    #ref6 = gua.augment_image(lref)
    oii = []
    orr = []
    for i in [image1,image2,image3,image4,image5,image6]:
        # crop the buffer back off, then split features and label
        oii.append(i[pad:(pad+224),pad:(pad+224),:-1])
        orr.append(i[pad:(pad+224),pad:(pad+224),-1])
    return [oii,orr]
# Concatenate augmented training data based on different types of augmentations
pc = 0
train_data_aug = []
for i in range(len(trainlo)):
    lo = trainlo[i]
    # slice out the patch plus its buffer on all sides
    lpatch = totaln[(lo[0]-pad):(lo[0]+224+pad),(lo[1]-pad):(lo[1]+224+pad),:]
    lref = reference[(lo[0]-pad):(lo[0]+224+pad),(lo[1]-pad):(lo[1]+224+pad)]
    if len(train_data_aug) == 0:
        train_data_aug = lpatch[pad:(-pad),pad:(-pad),:][np.newaxis,:,:,:]
        train_label_aug = lref[pad:(-pad),pad:(-pad)][np.newaxis,:,:]
    else:
        train_data_aug = np.concatenate((train_data_aug,lpatch[pad:(-pad),pad:(-pad),:][np.newaxis,:,:,:]),axis = 0)
        train_label_aug = np.concatenate((train_label_aug,lref[pad:(-pad),pad:(-pad)][np.newaxis,:,:]),axis = 0)
    [reim,rere] = process(lpatch,lref)
    for j in range(6):
        train_data_aug = np.concatenate((train_data_aug,reim[j][np.newaxis,:,:,:]),axis = 0)
        train_label_aug = np.concatenate((train_label_aug,rere[j][np.newaxis,:,:]),axis = 0)
    # periodically flush a chunk to disk to limit memory use
    if i%30 == 0:
        np.save(save_root+'data/gen/train_data_augP'+str(pc)+'.npy',train_data_aug)
        np.save(save_root+'data/gen/train_label_augP'+str(pc)+'.npy',train_label_aug)
        train_data_aug = []
        train_label_aug = []
        pc+=1
# store the remaining augmented training data
np.save(save_root+'data/gen/train_data_augP'+str(pc)+'.npy',train_data_aug)
np.save(save_root+'data/gen/train_label_augP'+str(pc)+'.npy',train_label_aug)
print("augmentation saved!")
# Concatenate the training data from the different augmentation chunks
for i in range(pc+1):
    temp = np.load(save_root+'data/gen/train_data_augP'+str(i)+'.npy')
    templ = np.load(save_root+'data/gen/train_label_augP'+str(i)+'.npy')
    if i == 0:
        fdata = temp
        fl = templ
    else:
        fdata = np.concatenate((fdata,temp),axis = 0)
        fl = np.concatenate((fl,templ),axis = 0)
# remove unnecessary intermediate files
#os.system('rm /content/drive/My Drive/USGS/Notebooks/data/train_*_augP*.npy')
# Shuffle the finalized arrays and save as .npy
rand = np.arange(len(fdata))
np.random.shuffle(rand)
train_data_aug = fdata[rand]
train_label_aug = fl[rand]
np.save(save_root+'data/gen/train_data_aug'+m+'.npy',train_data_aug)
np.save(save_root+'data/gen/train_label_aug'+m+'.npy',train_label_aug[:,:,:,np.newaxis])
print("Training data augmentation complete!")
###### visualization ########
# plt.imshow(lref[pad:(-pad),pad:(-pad)])
# plt.show()
# plt.imshow(lpatch[:,:,0][pad:(-pad),pad:(-pad)])
# plt.show()
# aug = ['rotate1','rotate2','scale1','scale2','shear','flip']
# for i in range(6):
#     print(aug[i])
#     plt.imshow(rere[i])
#     plt.show()
#     plt.imshow(reim[i][:,:,0])
#     plt.show()
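A quick sanity check of the saved outputs can catch shape mismatches before training (a suggested check, not part of the original notebook; it assumes the two files written above exist):

data = np.load(save_root+'data/gen/train_data_aug'+m+'.npy')
label = np.load(save_root+'data/gen/train_label_aug'+m+'.npy')
# Each sampled patch contributes itself plus 6 augmented variants.
print(data.shape, label.shape)   # (N, 224, 224, channels), (N, 224, 224, 1)
assert data.shape[0] == label.shape[0]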
We generate the testing samples using a moving-window method that covers the entire area left out of the training and validation samples.
Note: running this part may consume a large amount of memory.
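A rough estimate of the resulting array size (a back-of-the-envelope sketch; it assumes float64 inputs, the 3981 x 2640 extent noted in the code below, 5 channels, and the 164-pixel stride):

patch, stride, channels = 224, 224 - 2*30, 5
rows, cols = 3981 // stride, 2640 // stride
n_patches = rows * cols
print(n_patches, 'patches, ~%.2f GB' % (n_patches*patch*patch*channels*8/1e9))
# Repeated np.concatenate calls temporarily roughly double this footprint.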
import os
import numpy as np
import copy
import random
# stream/non-stream sample size
size = 2000 # number of samples
patch_size = 224 #patch size of each sample
#Total data dimension: 3981*2640
mask = np.load(root+'data/mask.npy')
totaldata = np.load(root+'data/total.npy')
totaldata = np.concatenate((totaldata,mask[:,:,np.newaxis]),axis = 2)
label = np.load(root+'data/reference.npy')
print('Completed: Data Loading!')
# buffer size
buf = 30
it = 'full'
# Image dimension
IMG_WIDTH = 224
IMG_HEIGHT = 224
# moving window size = image_dimension - 2*buffer_size
mw = IMG_WIDTH - buf*2
# Number of training channels: 5
# Add padding for the moving window; np.pad also pads the channel axis,
# so slice it back to the 5 feature channels.
totalnew = np.pad(totaldata, (buf, buf), 'symmetric')
totalnew = totalnew[:,:,buf:(buf+5)]
dim = totaldata.shape[:2]
# number of patch rows
numr = dim[0]//(IMG_WIDTH - buf*2)#224
# number of patch columns
print('rows:'+str(numr))
numc = dim[1]//(IMG_WIDTH - buf*2)#224
print('columns:'+str(numc))
# Splitting the total data into patches
count = 0
for i in range(numr):
    for j in range(numc):
        count += 1
        temp = totalnew[i*mw:(i*mw+224),j*mw:(j*mw+224),:][np.newaxis,:,:,:]
        if count == 1:
            total = temp#[:,:,:,:-1]
        else:
            total = np.concatenate((total, temp),axis = 0)
# Save the total dataset
np.save(save_root+'data/gen/prediction_data.npy',total)
print("Testing moving window is generate!")