Machine learning based interpolation for regional water table w. Python and Scikit Learn - Tutorial

Having a reasonable spatial distribution of the water table with few observation points is a challenge because the water table can't be above the surface. We wanted to develop a method where the computer learns not only about the position but also the surface to calculate the water table.

This is an applied example of water level interpolation based on a machine learning algorithm in Python with Scikit Learn. The code covers all steps from data compilation, regression formulation, accuracy testing and raster generation.

Tutorial

Code

Working on the data compilation

# import packages to compile the data
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
### creation of a dataframe for the neural network
wakeLoc = pd.read_csv('../Csv/WakeCoNC_slugtest_Wells.csv', usecols=[2,3,4,6])
wakeLoc = wakeLoc.set_index('well_name_short')
wakeLoc.head()

latitude_dd_nad1983 longitude_dd_nad1983 alt_mp_navd1988
well_name_short
CH-252 35.815336 -78.925508 336.48
WK-283 35.733056 -78.675556 359.70
WK-332 35.720833 -78.500556 192.26
WK-334 35.720833 -78.500556 190.95
WK-368 35.943000 -78.684000 410.06
#add a value of hydraulic conductiviy. We read the hydraulic conductivities from another file and create a mean
wakeK = pd.read_csv('../Csv/WakeCoNC_slugtest_Analyses.csv',usecols=[1,6,19])
meanValues = wakeK.groupby('well_name_short')[['wl_static_dbls']].mean()
meanValues[:5]

wl_static_dbls
well_name_short
CH-252 41.2775
WK-283 29.9800
WK-332 16.8275
WK-334 15.2825
WK-368 121.0770
# apply the mean K to the original dataframe
wakeLoc['wlElev'] = -999.0
for index, row in wakeLoc.iterrows():
    wakeLoc.loc[index,'wlElev'] = wakeLoc.loc[index,'alt_mp_navd1988'] - meanValues.loc[index,'wl_static_dbls']
wakeLoc.to_csv('../Csv/wellDf.csv')
wakeLoc.tail()

latitude_dd_nad1983 longitude_dd_nad1983 alt_mp_navd1988 wlElev
well_name_short
WK-433 35.585893 -78.678039 326.92 322.2050
WK-434 35.585893 -78.678039 328.43 301.4225
WK-435 35.642404 -78.825825 434.40 411.3175
WK-436 35.695548 -78.766925 435.93 412.4400
WK-437 35.695548 -78.766925 436.87 406.9200

Machine learning model construction

# import required packages
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
#define the two datasets
headValues = wakeLoc.iloc[:,3].values.reshape(-1,1) #because it is a single column
paramValues = wakeLoc.iloc[:,:3].values
# Define the scaler
headScaler = StandardScaler().fit(headValues)

# Scale the train set
headValuesScaled = headScaler.transform(headValues)
headValuesScaled[:5]
array([[-0.24772745],
       [ 0.27174055],
       [-2.05019526],
       [-2.04665865],
       [-0.34132725]])
# Check scaler over head values
print(headValuesScaled.mean(axis=0))
print(headValuesScaled.std(axis=0))
[-1.43675921e-16]
[1.]
# Define the scaler
paramScaler = StandardScaler().fit(paramValues)
# Scale the train set
paramValuesScaled = paramScaler.transform(paramValues)
paramValuesScaled[:5]
array([[ 0.29272505, -2.01170002, -0.03127219],
       [-0.35552737, -0.2333869 ,  0.29555826],
       [-0.45182133,  1.01166934, -2.06122429],
       [-0.45182133,  1.01166934, -2.07966305],
       [ 1.29853513, -0.29346549,  1.00439467]])
# Check scaler over param values
print(paramValuesScaled.mean(axis=0))
print(paramValuesScaled.std(axis=0))
[-4.61330320e-14 -7.14199941e-14  1.69798816e-16]
[1. 1. 1.]
# Split over Train and Test
headTrain, headTest, paramTrain, paramTest = train_test_split(headValuesScaled, paramValuesScaled, 
                                                              test_size=0.2, random_state=42)
# Regressor
regr = MLPRegressor(hidden_layer_sizes=(100,100,100),max_iter=50000).fit(paramTrain, headTrain)
c:\Users\saulm\anaconda3\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:1650: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
# Get predicted values for test array
headPredict = regr.predict(paramTest)
# get the inverse for the test and predicted heads
headPredictedInversed = headScaler.inverse_transform(headPredict.reshape(-1,1))
headTestInversed = headScaler.inverse_transform(headTest.reshape(-1,1))
# Create single subplot
fig, ax = plt.subplots(figsize=(8, 6))

# Scatter plot
ax.scatter(headTestInversed, headPredictedInversed, alpha=0.7, label='Data')

# Regression line
x_line = np.linspace(headValues.min(), headValues.max(), 100).reshape(-1, 1)
ax.plot(x_line, x_line, color='red', linewidth=2, label='Indentity Line')

# Calculate metrics
r2 = r2_score(headTestInversed, headPredictedInversed)
rmse = np.sqrt(mean_squared_error(headTestInversed, headPredictedInversed))

# Add metrics as text
metrics_text = f'R² = {r2:.3f}\nRMSE = {rmse:.3f}'
ax.text(0.05, 0.95, metrics_text, transform=ax.transAxes, 
        verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat'))

ax.set_xlabel('Heads Observed')
ax.set_ylabel('Heads Predicted')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

Create the interpolated raster

import rasterio
from rasterio.plot import show
#open dem and array
demFeet = rasterio.open('../Rst/areaDem_feet_NAD83_clip.tif')
demArray = demFeet.read(1)
demShape = demArray.shape
print(demShape)
(1094, 1440)
interpHeads = np.zeros(demShape)
paramTupleList = []

i = 0
for row in range(demShape[0]):
    for col in range(demShape[1]):
        x,y = demFeet.xy(row,col)
        #print(x,y)
        paramTuple = [y,x,demArray[row,col]] #lat, lon
        #print(x,y,demArray[row,col])
        #headPredict = v
        #paramTupleScaled = paramScaler.transform([paramTuple])
        paramTupleList.append(paramTuple)
     
        if i % 400000 == 0:
            print('processing %d cells, please be patient!'%i)
        i+=1
processing 0 cells, please be patient!
processing 400000 cells, please be patient!
processing 800000 cells, please be patient!
processing 1200000 cells, please be patient!
paramTupleScaledList = paramScaler.transform(paramTupleList)
demFeetMeta = demFeet.meta
demFeetMeta
{'driver': 'GTiff',
 'dtype': 'float32',
 'nodata': -3.4028234663852886e+38,
 'width': 1440,
 'height': 1094,
 'count': 1,
 'crs': CRS.from_wkt('GEOGCS["NAD83",DATUM["North_American_Datum_1983",SPHEROID["GRS 1980",6378137,298.257222101004,AUTHORITY["EPSG","7019"]],AUTHORITY["EPSG","6269"]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4269"]]'),
 'transform': Affine(0.000277776805555558, 0.0, -78.833190499,
        0.0, -0.0002777788464351009, 35.982082827)}
interpHeadsList = regr.predict(paramTupleScaledList)
interpHeadsListInversed = headScaler.inverse_transform(interpHeadsList.reshape(-1,1))
interpHeadsListInversed
array([[345.69730345],
       [348.84354167],
       [347.86908141],
       ...,
       [205.89600829],
       [205.91801105],
       [208.86160631]])
interpHeadsArray = interpHeadsListInversed.reshape(demShape)
plt.imshow(interpHeadsArray)
<matplotlib.image.AxesImage at 0x231937267b0>
interpRaster = rasterio.open('../Rst/interpRaster.tif','w',**demFeetMeta)

interpRaster.write(interpHeadsArray,1)

interpRaster.close()

Input data

You can download the input data from this link:

owncloud.hatarilabs.com/s/pyv9obJnyObR52m

Password: Hatarilabs

Comment

Saul Montoya

Saul Montoya es Ingeniero Civil graduado de la Pontificia Universidad Católica del Perú en Lima con estudios de postgrado en Manejo e Ingeniería de Recursos Hídricos (Programa WAREM) de la Universidad de Stuttgart con mención en Ingeniería de Aguas Subterráneas y Hidroinformática.

 

Suscribe to our online newsletter

Subscribe for free newsletter, receive news, interesting facts and dates of our courses in water resources.