Source code for predictatops.configurationplusfiles

# -*- coding: utf-8 -*-
"""

This module sets up three objects from its class definitions.

- input_data() establishes where data is loaded from.
- configuration() establishes various configuration variables used in the rest of the code.
- output_data() establishes where data is written to. 

These are intended to be changed by the configurationplusfiles_runner.py module.

"""

##### import statements
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt

#%matplotlib inline
import welly
from welly import Well
import lasio
import glob
import pickle
import math
import os


##### import from other modules


##### Classes
class input_data:
    """
    A class object that holds paths and other information related to input data, such as the location of log files, top files, well information files, etc.

    Parameters
    ----------
    picks_file_path : str
        A string for the file path to the file with all the pick names and depths.
    picks_delimiter_str : str
        The delimiter of the file that has all the picks.
    path_to_logs_str : str
        The path to the directory with all the well logs.
    """

    def __init__(self, picks_file_path, picks_delimiter_str, path_to_logs_str):
        #### Default initiation = ('../../../SPE_006_originalData/OilSandsDB/PICKS.TXT', '\t', '../../../SPE_006_originalData/OilSandsDB/Logs/*.LAS')
        #### Only the arguments below are mandatory on initiation.
        self.data_directory = "../data/Mannville_input_data/v0.0.3-alpha/mannville_demo_data/"
        self.picks_file_path = (
            picks_file_path
        )  #### example = '../../../SPE_006_originalData/OilSandsDB/PICKS.TXT'
        self.picks_delimiter_str = picks_delimiter_str  #### example = '\t'
        self.picks_df = pd.read_csv(picks_file_path, delimiter=picks_delimiter_str)
        self.picks_dic = self.data_directory + "OilSandsDB/PICKS.TXT"
        self.picks_dic_file_path_delimiter = "\t"
        self.logs_path_to_folder = (
            path_to_logs_str
        )  #### example = '../../../SPE_006_originalData/OilSandsDB/Logs/*.LAS'
        #### Non-mandatory attributes. The defaults should work for the example dataset and can be changed with the set functions below.
        self.wells_file_path = self.data_directory + "OilSandsDB/WELLS.TXT"
        self.wells_file_path_delimiter = "\t"
        self.gis_file_path = self.data_directory + "well_lat_lng.csv"
        self.gis_file_path_delimiter = ","
        self.gis_lat_col = "lat"
        self.gis_long_col = "lng"
        # wells_wTopsCuves_toLoad = 'WellNamesWithGivenTopsCurves_defaultFileName.pkl'
        #### for logs
        self.las_folder_path = self.data_directory + "OilSandsDB/Logs/"
        self.well_format = ".LAS"
        #### Technically optional but often used:
        #### the GIS file is mandatory if you want to use information from nearby wells or a well's general location.
        self.wells_df = None
        self.gis_df = None
    def load_wells_file(self):
        """Load the wells file into a pandas dataframe."""
        self.wells_df = pd.read_csv(
            self.wells_file_path, delimiter=self.wells_file_path_delimiter
        )
        return self.wells_df
    def load_gis_file(self):
        """Load the GIS file into a pandas dataframe."""
        self.gis_df = pd.read_csv(
            self.gis_file_path, delimiter=self.gis_file_path_delimiter
        )
        return self.gis_df
    def set_wells_file_path(self, wells_file_path_str, wells_file_delimiter):
        """Set the wells file path as an attribute of the object and return the wells dataframe using load_wells_file. The file can be txt, tsv, or csv."""
        self.wells_file_path = wells_file_path_str
        self.wells_file_path_delimiter = wells_file_delimiter
        return self.load_wells_file()
    def set_gis_file_path(self, gis_file_path_str, gis_file_path_delimiter):
        """Set the GIS file path as an attribute of the object and return the GIS dataframe using load_gis_file. The file can be txt, tsv, or csv."""
        self.gis_file_path = gis_file_path_str
        self.gis_file_path_delimiter = gis_file_path_delimiter
        return self.load_gis_file()
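

#### A minimal usage sketch (not part of the original module): how an input_data
#### object might be built for the Mannville demo dataset. The base path below is
#### an assumption taken from the defaults above, and the helper name is
#### hypothetical; adjust both for your own data layout.
def _example_input_data_setup():
    base = "../data/Mannville_input_data/v0.0.3-alpha/mannville_demo_data/"
    input_data_inst = input_data(
        picks_file_path=base + "OilSandsDB/PICKS.TXT",
        picks_delimiter_str="\t",
        path_to_logs_str=base + "OilSandsDB/Logs/*.LAS",
    )
    wells_df = input_data_inst.load_wells_file()  #### WELLS.TXT as a dataframe
    gis_df = input_data_inst.load_gis_file()  #### well_lat_lng.csv as a dataframe
    return input_data_inst, wells_df, gis_df
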
class configuration:
    """
    A class to hold configuration variables you might change between runs, which is why it has a large number of attributes, listed below.

    The types of information stored here include mandatory curves, mandatory tops, column names, the name of the top you're trying to predict, etc.

    The object created by this class is used throughout Predictatops, so many modules re-import it. Be careful: if you change something in one module, close your code, and start up later with the next module, do not expect your changes to persist unless you saved them or wrote them into the configurationplusfiles_runner.py file.

    Parameters
    ----------
    none : none
        None.

    Attributes
    ----------
    csv_of_well_names_wTopsCuves__name : str
        Name of the csv file of well names that have the required tops and curves.
    csv_of_well_names_wTopCurves__path : str
        Path for the csv file of well names that have the required tops and curves.
    must_have_curves_list : list
        A list of curve-name strings like ['ILD', 'NPHI', 'GR', 'DPHI', 'DEPT'].
    curve_windows_for_rolling_features : list
        A list of integers like [5, 7, 11, 21].
    must_have_tops__list : list
        A list of tops that can be integers or strings, like [13000, 14000].
    target_top : str
        A string or integer like 13000.
    top_under_target : str
        A string or integer that is the name of a top under the top you want to predict, such as 14000.
    top_name_col_in_picks_df : str
        The top name column as it appears in the picks dataframe.
    siteID_col_in_picks_df : str
        The string for the siteID column in the picks dataframe, like 'SitID'.
    UWI : str
        The string for the UWI column, like "UWI".
    DEPTH_col_in_featureCreation : str
        The string for the depth column, like "DEPT".
    HorID_name_col_in_picks_df : str
        The string for the horizon ID column, like "HorID".
    quality_col_name_in_picks_df : str
        The string for the pick quality column, like "Quality".
    picks_depth_col_in_picks_df : str
        The string for the pick depth column, like 'Pick'.
    col_topTarget_Depth_predBy_NN1thick : str
        The string for the column of the target top depth predicted by nearest-neighbor thickness, like 'topTarget_Depth_predBy_NN1thick'.
    quality_items_to_skip__list : list
        A list of integer quality values for picks to optionally skip as not good quality. An example is [-1, 0].
    test : str
        Purpose not fully documented; the default value should be "test0".
    pick_class_str : str
        String for the top target pick prediction column, like 'TopTarget_Pick_pred'.
    threshold_returnCurvesThatArePresentInThisManyWells : int
        The minimum number of wells a curve has to be present in to be kept, for example 2000.
    max_numb_wells_to_load : int
        Max number of wells to load out of all the wells in the directory of wells. Useful when testing. Example is 1000000.
    split_traintest_percent : float
        The train vs. test split expressed as a fraction between 0 and 1; you give the fraction to keep for training. Example is 0.8.
    kdtree_leaf : int
        Levels of the KD-tree? Default is 2.
    kdtree_k : int
        Number of neighbors, or k, in the k-nearest-neighbor code for finding nearby wells for each well. Default is 8.
    rebalanceClassZeroMultiplier : int
        When rebalancing, the number of instances of class zero is duplicated this many times. Default is 100.
    rebalanceClass95Multiplier : int
        When rebalancing, the number of instances of class 95 is duplicated this many times. Default is 40.
    NN1_topTarget_DEPTH : str
        The string used for the column that holds the depth of the top in the first nearest-neighbor training well. For example 'NN1_topTarget_DEPTH'.
    NN1_TopHelper_DEPTH : str
        Helper depth used in calculations for NN1_topTarget_DEPTH. Example is "NN1_TopHelper_DEPTH".
    trainOrTest : str
        String for the column that holds either 'train' or 'test'. Example is 'trainOrTest'.
    colsToNotTurnToFloats : list
        List of columns to not turn into floats during feature creation. Example is ['UWI', 'SitID', 'trainOrTest', 'Neighbors_Obj'].
    zonesAroundTops : dict
        A mapping of class labels to depth ranges around the top used to create those classes. Example is {"100": [0], "95": [-0.5, 0.5], "60": [-5, 0.5], "70": [0.5, 5], "0": []}. NOTE: the createFeat_withinZoneOfKnownPick(df, config) function in features.py currently ASSUMES only 5 zone labels.
    columns_to_not_trainOn_andNotCurves : list
        List of strings for names of columns that should not be trained on and are not curves. Example is ['FromBotWell', 'FromTopWel', 'rowsToEdge', 'lat', 'lng', 'SitID', 'TopHelper_HorID', 'TopTarget_HorID', 'TopHelper_DEPTH', 'diff_Top_Depth_Real_v_predBy_NN1thick', 'diff_TopTarget_DEPTH_v_rowDEPT', 'diff_TopHelper_DEPTH_v_rowDEPT', 'class_DistFrPick_TopHelper', 'NewWell', 'LastBitWell', 'TopWellDept', 'BotWellDept', 'WellThickness', 'rowsToEdge', 'closTopBotDist', 'closerToBotOrTop', 'Neighbors_Obj'].
    columns_to_not_trainOn_andAreCurves : list
        List of strings for curve columns that should not be trained on. Example is ['RHOB', 'SP', 'CALI', 'COND', 'DELT', 'DENS', 'DPHI:1', 'DPHI:2', 'DT', 'GR:1', 'GR:2', 'IL', 'ILD:1', 'ILD:2', 'ILM', 'LITH', 'LLD', 'LLS', 'PHID', 'PHIN', 'RESD', 'RT', 'SFL', 'SFLU', 'SN', 'SNP', 'Sp'].
    columns_to_use_as_labels : list
        List of strings for columns to use as labels. Example is ['class_DistFrPick_TopTarget', 'UWI', 'trainOrTest', 'TopTarget_DEPTH'].
    """

    def __init__(self):
        #### intermediate files and paths
        self.csv_of_well_names_wTopsCuves__name = ""
        self.csv_of_well_names_wTopCurves__path = "."
        #### Choices
        self.must_have_curves_list = [""]  # ['ILD', 'NPHI', 'GR', 'DPHI', 'DEPT']
        self.curve_windows_for_rolling_features = [5, 7, 11, 21]
        self.must_have_tops__list = [13000, 14000]
        self.target_top = 13000
        self.top_under_target = 14000
        #### Column string names
        self.top_name_col_in_picks_df = ""
        self.siteID_col_in_picks_df = "SitID"
        self.UWI = "UWI"
        self.DEPTH_col_in_featureCreation = "DEPT"
        self.HorID_name_col_in_picks_df = "HorID"
        self.quality_col_name_in_picks_df = "Quality"
        self.picks_depth_col_in_picks_df = "Pick"
        self.col_topTarget_Depth_predBy_NN1thick = "topTarget_Depth_predBy_NN1thick"
        self.quality_items_to_skip__list = [-1, 0]
        self.test = "test0"
        self.pick_class_str = "TopTarget_Pick_pred"
        self.threshold_returnCurvesThatArePresentInThisManyWells = 2000
        self.max_numb_wells_to_load = 1000000
        self.split_traintest_percent = 0.8
        self.kdtree_leaf = 2
        self.kdtree_k = 8
        self.rebalanceClassZeroMultiplier = 100
        self.rebalanceClass95Multiplier = 40
        self.NN1_topTarget_DEPTH = "NN1_topTarget_DEPTH"
        self.NN1_TopHelper_DEPTH = "NN1_TopHelper_DEPTH"
        self.trainOrTest = "trainOrTest"
        self.colsToNotTurnToFloats = ["UWI", "SitID", "trainOrTest", "Neighbors_Obj"]
        self.zonesAroundTops = {
            "100": [0],
            "95": [-0.5, 0.5],
            "60": [-5, 0.5],
            "70": [0.5, 5],
            "0": [],
        }
        #### NOTE: The code in the createFeat_withinZoneOfKnownPick(df, config) function in features.py currently ASSUMES only 5 zone labels.
        self.columns_to_not_trainOn_andNotCurves = [
            "FromBotWell",
            "FromTopWel",
            "rowsToEdge",
            "lat",
            "lng",
            "SitID",
            "TopHelper_HorID",
            "TopTarget_HorID",
            "TopHelper_DEPTH",
            "diff_Top_Depth_Real_v_predBy_NN1thick",
            "diff_TopTarget_DEPTH_v_rowDEPT",
            "diff_TopHelper_DEPTH_v_rowDEPT",
            "class_DistFrPick_TopHelper",
            "NewWell",
            "LastBitWell",
            "TopWellDept",
            "BotWellDept",
            "WellThickness",
            "rowsToEdge",
            "closTopBotDist",
            "closerToBotOrTop",
            "Neighbors_Obj",
        ]
        self.columns_to_not_trainOn_andAreCurves = [
            "RHOB",
            "SP",
            "CALI",
            "COND",
            "DELT",
            "DENS",
            "DPHI:1",
            "DPHI:2",
            "DT",
            "GR:1",
            "GR:2",
            "IL",
            "ILD:1",
            "ILD:2",
            "ILM",
            "LITH",
            "LLD",
            "LLS",
            "PHID",
            "PHIN",
            "RESD",
            "RT",
            "SFL",
            "SFLU",
            "SN",
            "SNP",
            "Sp",
        ]
        self.columns_to_use_as_labels = [
            "class_DistFrPick_TopTarget",
            "UWI",
            "trainOrTest",
            "TopTarget_DEPTH",
        ]
        # self.results_path = "../results"
        # self.availableData_path = "availableData"

    #### only keep wells that have these curves
    def set_must_have_curves(self, must_have_curves_in_list):
        """Set the list of curves a well must have to be kept."""
        self.must_have_curves_list = must_have_curves_in_list
        print("must have curve list is: ", must_have_curves_in_list)

    def get_must_have_curves(self, must_have_curves_in_list):
        """Return the list of curves a well must have to be kept."""
        return self.must_have_curves_list
    #### only keep wells that have these tops
    def set_must_have_tops__list(self, must_have_tops__list):  # [13000, 14000]
        """Set the list of tops a well must have to be kept."""
        self.must_have_tops__list = must_have_tops__list
        print("set must_have_tops_list as: ", self.must_have_tops__list)

    def get_must_have_tops__list(self):  # [13000, 14000]
        """Return the list of tops a well must have to be kept."""
        return self.must_have_tops__list

    def set_quality_items_to_skip__list(self, quality_items_to_skip__list):
        """Set the list of pick quality values to skip."""
        self.quality_items_to_skip__list = quality_items_to_skip__list
        print("set quality_items_to_skip__list as: ", quality_items_to_skip__list)

    def get_quality_items_to_skip__list(self):
        """Return the list of pick quality values to skip."""
        return self.quality_items_to_skip__list
    #### column names in picks_df
    def set_top_name_col_in_picks_df(self, top_name_col_in_picks_df__str):
        """Set the name of the top-name column in the picks dataframe."""
        self.top_name_col_in_picks_df = top_name_col_in_picks_df__str
        print(" set self.top_name_col_in_picks_df as: ", top_name_col_in_picks_df__str)

    def set_siteID_col_in_picks_df(self, sitID__str):
        """Set the name of the siteID column in the picks dataframe."""
        self.siteID_col_in_picks_df = sitID__str
        print(" set siteID_col_in_picks_df as: ", self.siteID_col_in_picks_df)

    def get_siteID_col_in_picks_df(self):
        """Return the name of the siteID column in the picks dataframe."""
        return self.siteID_col_in_picks_df

    def get_top_name_col_in_picks_df(self):
        """Return the name of the top-name column in the picks dataframe."""
        return self.top_name_col_in_picks_df

    def set_quality_col_name_in_picks_df(self, Quality__str):
        """Set the name of the pick quality column in the picks dataframe."""
        self.quality_col_name_in_picks_df = Quality__str

    def get_quality_col_name_in_picks_df(self):
        """Return the name of the pick quality column in the picks dataframe."""
        return self.quality_col_name_in_picks_df

    def set_picks_depth_col_in_picks_df(self, picks_depth_col_in_picks_df):
        """Set the name of the pick depth column in the picks dataframe."""
        self.picks_depth_col_in_picks_df = picks_depth_col_in_picks_df

    def get_picks_depth_col_in_picks_df(self):
        """Return the name of the pick depth column in the picks dataframe."""
        return self.picks_depth_col_in_picks_df
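

#### A minimal usage sketch (not part of the original module): how a configuration
#### object might be customized before the other Predictatops modules pick it up.
#### The helper name is hypothetical; the curve, top, and quality values are the
#### example values from the docstring above.
def _example_configuration_setup():
    config = configuration()
    config.set_must_have_curves(["ILD", "NPHI", "GR", "DPHI", "DEPT"])
    config.set_must_have_tops__list([13000, 14000])
    config.set_quality_items_to_skip__list([-1, 0])
    return config
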
class output_data:
    """
    A class to keep information related to where output files are saved and naming conventions. This class can also make all the directories for intermediate result files via its make_all_directories() function.

    The types of information stored here include all the intermediate output file paths used as you run the different functions and modules of Predictatops.

    The object created by this class is used throughout Predictatops, so many modules re-import it. Be careful: if you change something in one module, close your code, and start up later with the next module, do not expect your changes to persist unless you saved them or wrote them into the configurationplusfiles_runner.py file.

    Parameters
    ----------
    none : none
        None.

    Attributes
    ----------
    default_results_file_format : str
        The default file format for result files. Example is ".h5".
    base_path_for_all_results : str
        A base path for all results. Example is '../results/'.
    path_checkData : str
        A path string for the checkData directory. Example is 'checkData'.
    path_load : str
        A path string for the load directory. Example is 'load'.
    path_split : str
        A path string for the split directory. Example is 'split'.
    path_wellsKNN : str
        A path string for the wellsKNN directory. Example is 'wellsKNN'.
    path_features : str
        A path string for the features directory. Example is 'features'.
    path_balance : str
        A path string for the balance directory. Example is 'balance'.
    path_trainclasses : str
        A path string for the trainclasses directory. Example is 'trainclasses'.
    path_prediction : str
        A path string for the prediction directory. Example is 'prediction'.
    path_evaluate : str
        A path string for the evaluation directory. Example is 'evaluate'.
    path_map : str
        A path string for the map directory. Example is 'map'.
    path_plot : str
        A path string for the plot directory. Example is 'plot'.
    loaded_results_wells_df : str
        A name string for the dataframe of loaded wells with top curves. Example is "loaded_wells_wTopsCurves".
    split_results_wells_df : str
        A name string for the dataframe of loaded wells with top curves that have been split. Example is "wells_wTopsCurvesSplits".
    wellsKNN_results_wells_df : str
        A name string for the dataframe of loaded wells with top curves, split, and with KNN features. Example is "wells_wTopsCurvesSplitsKNN".
    features_results_wells_df : str
        A name string for the dataframe of loaded wells with top curves, split, with KNN features, and with the main features from the features.py module. Example is "wells_wTopsCurvesSplitsKNNFeatures".
    balance_results_wells_df : str
        A name string for the dataframe of loaded wells with top curves, split, with KNN features, with features from features.py, and with rebalanced classes. Example is "wells_wTopsCurvesSplitsKNNFeaturesBalance".
    trainclasses_results_model : str
        A name string for the trained model. Example is "model_trainclasses_wTopsCurvesSplitsKNNFeaturesBalance".
    """

    def __init__(self):
        #### paths to directories to store intermediate and final results
        self.default_results_file_format = ".h5"
        self.base_path_for_all_results = "../results/"
        self.path_checkData = "checkData"
        self.path_load = "load"
        self.path_split = "split"
        self.path_wellsKNN = "wellsKNN"
        self.path_features = "features"
        self.path_balance = "balance"
        self.path_trainclasses = "trainclasses"
        self.path_prediction = "prediction"
        self.path_evaluate = "evaluate"
        self.path_map = "map"
        #### IF YOU ADD ANOTHER DIRECTORY TO THE ONES ABOVE, ADD IT TO THE LIST IN THE make_all_directories(self) FUNCTION !!!
        ####
        self.loaded_results_wells_df = "loaded_wells_wTopsCurves"
        self.split_results_wells_df = "wells_wTopsCurvesSplits"
        self.wellsKNN_results_wells_df = "wells_wTopsCurvesSplitsKNN"
        self.features_results_wells_df = "wells_wTopsCurvesSplitsKNNFeatures"
        self.balance_results_wells_df = "wells_wTopsCurvesSplitsKNNFeaturesBalance"
        self.trainclasses_results_model = (
            "model_trainclasses_wTopsCurvesSplitsKNNFeaturesBalance"
        )
    def make_all_directories(self):
        """
        A function that makes all the directories defined in the attributes of the output_data() class __init__ function.

        Examples of directories made include: [self.path_checkData, self.path_load, self.path_split, self.path_wellsKNN, self.path_features, self.path_balance, self.path_trainclasses, self.path_prediction, self.path_evaluate, self.path_map]

        Parameters
        ----------
        none : none
            None.

        Returns
        -------
        none : none
            The function does not return anything, though it does print all the directories it creates, whether they already exist, and the base results directory created by running this function.
        """
        print("making base folder for results in:", self.base_path_for_all_results)
        list_of_sub_directories = [
            self.path_checkData,
            self.path_load,
            self.path_split,
            self.path_wellsKNN,
            self.path_features,
            self.path_balance,
            self.path_trainclasses,
            self.path_prediction,
            self.path_evaluate,
            self.path_map,
        ]
        if not os.path.exists(self.base_path_for_all_results):
            try:
                os.makedirs(self.base_path_for_all_results)
            except OSError as e:
                print(e.errno)
                # if e.errno != errno.EEXIST:
                #     raise
        else:
            print(
                "base_path directory already exists,",
                self.base_path_for_all_results,
                " so not creating it again. This may or may not be what you intended, so just flagging it.",
            )
        for sub_dir in list_of_sub_directories:
            if not os.path.exists(self.base_path_for_all_results + "/" + sub_dir):
                try:
                    os.makedirs(self.base_path_for_all_results + "/" + sub_dir)
                except OSError as e:
                    print(e.errno)
                    # if e.errno != errno.EEXIST:
                    #     raise
            else:
                print(
                    "directory ",
                    sub_dir,
                    " already exists so not making it again in the make_all_directories function of configurationplusfiles.py",
                )
        print(
            "made directories for each step in the process. They should be in: ",
            self.base_path_for_all_results,
        )
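

#### A minimal sketch (not part of the original module) of how
#### configurationplusfiles_runner.py might wire the three classes together and
#### persist them for the later Predictatops modules. The pickle file names and the
#### helper name are assumptions, not necessarily the runner's actual behavior.
def _example_runner_flow():
    base = "../data/Mannville_input_data/v0.0.3-alpha/mannville_demo_data/"
    input_data_inst = input_data(
        picks_file_path=base + "OilSandsDB/PICKS.TXT",
        picks_delimiter_str="\t",
        path_to_logs_str=base + "OilSandsDB/Logs/*.LAS",
    )
    config = configuration()
    output_data_inst = output_data()
    output_data_inst.make_all_directories()  #### creates ../results/ and its subfolders
    #### persist the three objects so downstream modules can reload the same settings
    for file_name, obj in [
        ("input_data_obj.pkl", input_data_inst),
        ("config_obj.pkl", config),
        ("output_data_obj.pkl", output_data_inst),
    ]:
        with open(output_data_inst.base_path_for_all_results + file_name, "wb") as f:
            pickle.dump(obj, f)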