# -*- coding: utf-8 -*-
"""
The a module sets up three objects from class functions.
- input_data() establishes where data is loaded from.
- configuration() establishes various configuration variables used in the rest of the code.
- output_data() establishes where data is written to.
These are intended to be changed by the configurationplusfiles_runner.py module.
"""
##### import statements
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
#%matplotlib inline
import welly
from welly import Well
import lasio
import glob
import pickle
import math
import os
##### import from other modules
##### Classes
[docs]class configuration:
"""
A class to keep configuration variables you might change between runs. That is why it has a large number of attributes listed below.
Types of information information stored in here would mandetory curves or mandatory tops, column names, name of the top you're trying to predict, etc.
The object created by this class is used throughout Predictatops, so many modules reimport it.
Be careful to not change something in one module close your code, start up later working with the next module and except your changes to persis unless you saved them or wrote them into the configurationplusfiles_runner.py file.
Parameters
----------
none:none
None.
Attributes
----------
csv_of_well_names_wTopsCuves__name : str
csv_of_well_names_wTopsCuves__name
csv_of_well_names_wTopCurves__path : str
csv_of_well_names_wTopsCuves__name
must_have_curves_list : list
An array of strings that are curve names like ['ILD', 'NPHI', 'GR', 'DPHI', 'DEPT']
curve_windows_for_rolling_features : list
Array of integers like [5,7,11,21]
must_have_tops__list : list
An array of tops list that could be integers or strings like [13000,14000]
target_top : str
A string or integer like 1300
top_under_target : str
A string or interger that is a top name and is the name of a top under the top you want to predict such as 14000
top_name_col_in_picks_df : str
The top name as it appears in the picks dataframe
siteID_col_in_picks_df : str
The string for the siteID column in the picks dataframe like 'SitID'
UWI : str
The string for the UWI column like "UWI"
DEPTH_col_in_featureCreation : str
The string for the depth column like "DEPT"
HorID_name_col_in_picks_df : str
The string for the horizon ID column like "HorID"
quality_col_name_in_picks_df : str
The string for the quality of the pick column like "Quality"
picks_depth_col_in_picks_df : str
The string for the pick column name like 'Pick'
col_topTarget_Depth_predBy_NN1thick : str
The string for the top target depth predicted by nearest neighbor thickess like 'topTarget_Depth_predBy_NN1thick'
quality_items_to_skip__list : str
The array of the integers for the quality of wells to optionally skip as not good quality picks. An example is [-1,0]
test : str
Honestly forget what this is come back and find out but is should be "test0"
pick_class_str : str
String for the top taget pick prediction column like 'TopTarget_Pick_pred'
threshold_returnCurvesThatArePresentInThisManyWells : int
The integer for the number of wells a curve has to be present in to be kept for example 2000
max_numb_wells_to_load : int
Max number of wells to load out of all the wells in the directory with wells. This is used for when you're testing. Example is 1000000
split_traintest_percent : float
The percent in 0 to 1 terms for train vs. split. You give the percent to keep. example is 0.8
kdtree_leaf : int
Levels of kdtree? default is 2
kdtree_k : int
Integer for number of neighbors or K in k nearest neighbor code for finding nearby wells for each well. Default is 8
rebalanceClassZeroMultiplier : int
When rebalancing class zero. The number of instances of class zero is duplicated by this times. Default is 100
rebalanceClass95Multiplier : int
When rebalancing class zero. The number of instances of class 95 is duplicated by this times. Default is 40
NN1_topTarget_DEPTH : str
The string used in the column that holds the depth of the top in the first nearest neighbor training well. For example 'NN1_topTarget_DEPTH'
NN1_TopHelper_DEPTH : str
Helper depth for calculations for NN1_topTarget_DEPTH. Example is "NN1_TopHelper_DEPTH"
trainOrTest : str
String for column that holds string of either train or test. Example is 'trainOrTest'
colsToNotTurnToFloats : list
List of columsn to not turn to floads during feature creation. Examples is ['UWI', 'SitID', 'trainOrTest','Neighbors_Obj']
zonesAroundTops : object
An object of class lables and depths around top to create those classes in. Example is {"100":[0],"95":[-0.5,0.5],"60":[-5,0.5],"70":[0.5,5],"0":[]} #### NOTE: The code in createFeat_withinZoneOfKnownPick(df,config) function in features.py current ASSUMES only 5 zone labels
columns_to_not_trainOn_andNotCurves : list
List of strings for names of columns to not train on and are not curves. Example is ['FromBotWell','FromTopWel''rowsToEdge','lat','lng', 'SitID','TopHelper_HorID','TopTarget_HorID','TopHelper_DEPTH','diff_Top_Depth_Real_v_predBy_NN1thick','diff_TopTarget_DEPTH_v_rowDEPT','diff_TopHelper_DEPTH_v_rowDEPT','class_DistFrPick_TopHelper','NewWell','LastBitWell','TopWellDept','BotWellDept','WellThickness','rowsToEdge','closTopBotDist','closerToBotOrTop','Neighbors_Obj']
columns_to_not_trainOn_andAreCurves : list
list of strings for columns to not train on that are curves. Example is ['RHOB','SP','CALI','COND','DELT','DENS','DPHI:1','DPHI:2','DT','GR:1','GR:2','IL','ILD:1','ILD:2','ILM','LITH','LLD','LLS','PHID','PHIN','RESD','RT','SFL','SFLU','SN','SNP','Sp']
columns_to_use_as_labels : list
List of strings for columns to use as labels. Examples are= ['class_DistFrPick_TopTarget','UWI','trainOrTest','TopTarget_DEPTH']
"""
def __init__(self):
#### intermediate files and paths
self.csv_of_well_names_wTopsCuves__name = ""
self.csv_of_well_names_wTopCurves__path = "."
#### Choices
self.must_have_curves_list = [""] # ['ILD', 'NPHI', 'GR', 'DPHI', 'DEPT']
self.curve_windows_for_rolling_features = [5, 7, 11, 21]
self.must_have_tops__list = [13000, 14000]
self.target_top = 13000
self.top_under_target = 14000
#### Column string names
self.top_name_col_in_picks_df = ""
self.siteID_col_in_picks_df = "SitID"
self.UWI = "UWI"
self.DEPTH_col_in_featureCreation = "DEPT"
self.HorID_name_col_in_picks_df = "HorID"
self.quality_col_name_in_picks_df = "Quality"
self.picks_depth_col_in_picks_df = "Pick"
self.col_topTarget_Depth_predBy_NN1thick = "topTarget_Depth_predBy_NN1thick"
self.quality_items_to_skip__list = [-1, 0]
self.test = "test0"
self.pick_class_str = "TopTarget_Pick_pred"
self.threshold_returnCurvesThatArePresentInThisManyWells = 2000
self.max_numb_wells_to_load = 1000000
self.split_traintest_percent = 0.8
self.kdtree_leaf = 2
self.kdtree_k = 8
self.rebalanceClassZeroMultiplier = 100
self.rebalanceClass95Multiplier = 40
self.NN1_topTarget_DEPTH = "NN1_topTarget_DEPTH"
self.NN1_TopHelper_DEPTH = "NN1_TopHelper_DEPTH"
self.trainOrTest = "trainOrTest"
self.colsToNotTurnToFloats = ["UWI", "SitID", "trainOrTest", "Neighbors_Obj"]
self.zonesAroundTops = {
"100": [0],
"95": [-0.5, 0.5],
"60": [-5, 0.5],
"70": [0.5, 5],
"0": [],
} #### NOTE: The code in createFeat_withinZoneOfKnownPick(df,config) function in features.py current ASSUMES only 5 zone labels
self.columns_to_not_trainOn_andNotCurves = [
"FromBotWell",
"FromTopWel" "rowsToEdge",
"lat",
"lng",
"SitID",
"TopHelper_HorID",
"TopTarget_HorID",
"TopHelper_DEPTH",
"diff_Top_Depth_Real_v_predBy_NN1thick",
"diff_TopTarget_DEPTH_v_rowDEPT",
"diff_TopHelper_DEPTH_v_rowDEPT",
"class_DistFrPick_TopHelper",
"NewWell",
"LastBitWell",
"TopWellDept",
"BotWellDept",
"WellThickness",
"rowsToEdge",
"closTopBotDist",
"closerToBotOrTop",
"Neighbors_Obj",
]
self.columns_to_not_trainOn_andAreCurves = [
"RHOB",
"SP",
"CALI",
"COND",
"DELT",
"DENS",
"DPHI:1",
"DPHI:2",
"DT",
"GR:1",
"GR:2",
"IL",
"ILD:1",
"ILD:2",
"ILM",
"LITH",
"LLD",
"LLS",
"PHID",
"PHIN",
"RESD",
"RT",
"SFL",
"SFLU",
"SN",
"SNP",
"Sp",
]
self.columns_to_use_as_labels = [
"class_DistFrPick_TopTarget",
"UWI",
"trainOrTest",
"TopTarget_DEPTH",
]
# self.results_path = "../results"
# self.availableData_path = "availableData"
#### only keep wells that have these curves
[docs] def set_must_have_curves(self, must_have_curves_in_list):
"""doc string goes here"""
self.must_have_curves_list = must_have_curves_in_list
print("must have curve list is: ", must_have_curves_in_list)
[docs] def get_must_have_curves(self, must_have_curves_in_list):
"""doc string goes here"""
return self.must_have_curves_list
#### only keep wells that have these tops
[docs] def set_must_have_tops__list(self, must_have_tops__list):
# [13000,14000]
self.must_have_tops__list = must_have_tops__list
print("set must_have_tops_list as: ", self.must_have_tops__list)
[docs] def get_must_have_tops__list(self):
# [13000,14000]
return self.must_have_tops__list
[docs] def set_quality_items_to_skip__list(self, quality_items_to_skip__list):
self.quality_items_to_skip__list = quality_items_to_skip__list
print("set quality_items_to_skip__list as: ", quality_items_to_skip__list)
[docs] def get_quality_items_to_skip__list(self):
return self.quality_items_to_skip__list
#### column names in picks_df
[docs] def set_top_name_col_in_picks_df(self, top_name_col_in_picks_df__str):
self.top_name_col_in_picks_df = top_name_col_in_picks_df__str
print(" set self.top_name_col_in_picks_df as: ", top_name_col_in_picks_df__str)
[docs] def set_siteID_col_in_picks_df(self, sitID__str):
self.siteID_col_in_picks_df = sitID__str
print(" set siteID_col_in_picks_df as: ", self.siteID_col_in_picks_df)
[docs] def get_siteID_col_in_picks_df(self):
return self.siteID_col_in_picks_df
[docs] def get_top_name_col_in_picks_df(self):
return self.top_name_col_in_picks_df
[docs] def set_quality_col_name_in_picks_df(self, Quality__str):
self.quality_col_name_in_picks_df = Quality__str
[docs] def get_quality_col_name_in_picks_df(self):
return self.quality_col_name_in_picks_df
[docs] def set_picks_depth_col_in_picks_df(self, picks_depth_col_in_picks_df):
self.picks_depth_col_in_picks_df = picks_depth_col_in_picks_df
[docs] def get_picks_depth_col_in_picks_df(self):
return self.picks_depth_col_in_picks_df
[docs]class output_data:
"""
A class to keep information related to where output files are saved and naming conventions.
This class can also makes all the directories for intermediate result files via its make_all_directories() function.
Types of information information stored in here would all the intermediate output file paths as you run different functions and modules of Predictatops.
The object created by this class is used throughout Predictatops, so many modules reimport it.
Be careful to not change something in one module close your code, start up later working with the next module and except your changes to persis unless you saved them or wrote them into the configurationplusfiles_runner.py file.
Parameters
----------
none:none
None.
Attributes
----------
default_results_file_format : str = ".h5"
A base path for all results. Example is '../results/'
path_checkData : str
A path string for the checkData directory. Example is 'checkData'
path_load : str
A path string for the load directory. Example is 'load'
path_split : str
A path string for the split directory. Example is 'split'
path_wellsKNN : str
A path string for the wellsKNN directory. Example is 'wellsKNN'
path_features : str
A path string for the features directory. Example is 'features'
path_balance : str
A path string for the balance directory. Example is 'balance'
path_trainclasses : str
A path string for the trainclasses directory. Example is 'trainclasses'
path_prediction :str
A path string for the prediction directory. Example is 'prediction'
path_evaluate : str
A path string for the evaluation directory. Example is 'evaluate'
path_map : str
A path string for the map directory. Example is 'map'
path_plot : str
A path string for the plot directory. Example is 'plot
loaded_results_wells_df : str
A path string for the loaded wells with top curves dataframe. Example is "loaded_wells_wTopsCurves"
split_results_wells_df : str
A path string for the loaded wells with top curves and splited dataframe. Example is "wells_wTopsCurvesSplits"
wellsKNN_results_wells_df : str
A path string for the loaded wells with top curves splitted and with KNN features dataframe. Example is "wells_wTopsCurvesSplitsKNN"
features_results_wells_df : str
A path string for the loaded wells with top curves splitted with KNN features and main features from features.py module dataframe. Example is "wells_wTopsCurvesSplitsKNNFeatures"
balance_results_wells_df : str
A path string for the loaded wells with top curves splitted and with KNN features and features from features.py and rebalanced classes dataframe. Example is "wells_wTopsCurvesSplitsKNNFeaturesBalance"
trainclasses_results_model : str
A path string for the trained model. Example is "model_trainclasses_wTopsCurvesSplitsKNNFeaturesBalance"
"""
def __init__(self):
#### paths to directories to store itermediate and final results
self.default_results_file_format = ".h5"
self.base_path_for_all_results = "../results/"
self.path_checkData = "checkData"
self.path_load = "load"
self.path_split = "split"
self.path_wellsKNN = "wellsKNN"
self.path_features = "features"
self.path_balance = "balance"
self.path_trainclasses = "trainclasses"
self.path_prediction = "prediction"
self.path_evaluate = "evaluate"
self.path_map = "map"
#### IF YOU ADD ANOTHER DIRECTORY TO THE ONES ABOVE, ADD IT TO THE LIST IN make_all_directories(self) FUNCTION !!! ####
self.loaded_results_wells_df = "loaded_wells_wTopsCurves"
self.split_results_wells_df = "wells_wTopsCurvesSplits"
self.wellsKNN_results_wells_df = "wells_wTopsCurvesSplitsKNN"
self.features_results_wells_df = "wells_wTopsCurvesSplitsKNNFeatures"
self.balance_results_wells_df = "wells_wTopsCurvesSplitsKNNFeaturesBalance"
self.trainclasses_results_model = (
"model_trainclasses_wTopsCurvesSplitsKNNFeaturesBalance"
)
[docs] def make_all_directories(self):
"""
A function that makes all the directories defined in the attributes of the output_data() class init function.
Examples of directories made include: [self.path_checkData,self.path_load,self.path_split,self.path_wellsKNN,self.path_features,
self.path_balance,self.path_trainclasses,self.path_prediction,self.path_evaluate,self.path_map]
Parameters
----------
none:none
None.
Return
------
none: none
The function does not return anything though it does print all the directories it creates, whether they already exist, and the base results directory created by running this function.
"""
print("making base folder for results in:", self.base_path_for_all_results)
list_of_sub_directories = [
self.path_checkData,
self.path_load,
self.path_split,
self.path_wellsKNN,
self.path_features,
self.path_balance,
self.path_trainclasses,
self.path_prediction,
self.path_evaluate,
self.path_map,
]
if not os.path.exists(self.base_path_for_all_results):
os.makedirs(self.base_path_for_all_results)
try:
os.makedirs(self.base_path_for_all_results)
except OSError as e:
print(e.errno)
# if e.errno != errno.EEXIST:
# raise
else:
print(
"base_path directory already exists,",
self.base_path_for_all_results,
" so not creating it again. This may or may not be what you intended, so just flagging it.",
)
for sub_dir in list_of_sub_directories:
if not os.path.exists(self.base_path_for_all_results + "/" + sub_dir):
os.makedirs(self.base_path_for_all_results + "/" + sub_dir)
try:
os.makedirs(self.base_path_for_all_results + "/" + sub_dir)
except OSError as e:
print(e.errno)
# if e.errno != errno.EEXIST:
# raise
else:
print(
"directory ",
sub_dir,
" already exists so not making it again in make_all_directories function of configurationplusfiles.py",
)
print(
"made directories for each step in the process. They should be in : ",
self.base_path_for_all_results,
)