#! /usr/bin/env python """ |------------------------------------------------------------------| |--------------------- Instructions -----------------------------| |------------------------------------------------------------------| This script is for modifying surface dataset at neon sites using data available from the neon server. After creating a single point surface data file from a global surface data file using subset_data.py, use this script to overwrite some fields with site-specific data for neon sites. This script will do the following: - Download neon data for the specified site if it does not exist in the specified directory : (i.e. ../../../neon_surf_files). - Modify surface dataset with downloaded data. ------------------------------------------------------------------- Instructions for running using conda python environments: ../../py_env_create conda activate ctsm_py ------------------------------------------------------------------- To see the available options: ./modify_singlept_site_neon.py --help ------------------------------------------------------------------- Example: ./modify_singlept_site_neon.py --neon_site PUUM --debug ------------------------------------------------------------------- """ # TODO (NS) # --[] If subset file not found run subset_data.py # --[] Download files only when available. # Import libraries from __future__ import print_function import argparse from datetime import date from getpass import getuser import glob import logging import os import sys import requests import numpy as np import pandas as pd import xarray as xr from packaging import version from ctsm.path_utils import path_to_ctsm_root myname = getuser() # Seconds to wait before requests.get() times out TIMEOUT = 60 # -- valid neon sites valid = glob.glob( os.path.join(path_to_ctsm_root(), "cime_config", "usermods_dirs", "NEON", "[!d]*") ) valid_neon_sites = [x[-4:] for x in valid] # last 4 letters in each string def get_parser(): """ Get parser object for this script. """ parser = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter ) parser.print_usage = parser.print_help parser.add_argument( "--neon_site", help="4-letter neon site code.", action="store", dest="site_name", choices=valid_neon_sites, required=True, ) parser.add_argument( "--surf_dir", help=""" Directory of single point surface dataset. [default: %(default)s] """, action="store", dest="surf_dir", type=str, required=False, default="/glade/derecho/scratch/" + myname + "/single_point/", ) parser.add_argument( "--out_dir", help=""" Directory to write updated single point surface dataset. [default: %(default)s] """, action="store", dest="out_dir", type=str, required=False, default="/glade/derecho/scratch/" + myname + "/single_point_neon_updated/", ) parser.add_argument( "--inputdata-dir", help=""" Directory containing standard input files from CESM input data such as the surf_soildepth_file. [default: %(default)s] """, action="store", dest="inputdatadir", type=str, required=False, default="/glade/campaign/cesm/cesmdata/cseg/inputdata", ) parser.add_argument( "-d", "--debug", help=""" Debug mode will print more information. [default: %(default)s] """, action="store_true", dest="debug", default=False, ) parser.add_argument( "--16pft", help="Modify 16-pft surface data files (e.g. for a FATES run)", action="store_true", dest="pft_16", default=False, ) return parser def get_neon(neon_dir, site_name): """ Function for finding neon data files and download from neon server if the file does not exist. Args: neon_dir (str): local directory for downloading neon data. site_name (str): 4 letter neon site name Raises: Error if the download was not successful (exit code:404). In case the data does not exist in the neon server or if neon server is down. Returns: neon_file (str) : complete file name of the downloaded data """ # -- create directory if not exists if not os.path.exists(neon_dir): os.makedirs(neon_dir) neon_file = os.path.join(neon_dir, site_name + "_surfaceData.csv") # -- Download the file if it does not exist if os.path.isfile(neon_file): print("neon file for", site_name, "already exists! ") print("Skipping download from neon for", site_name, "...") else: print("------------------------------------------------") print("Beginning download from neon server for", site_name, "...") url = ( "https://s3.data.neonscience.org/neon-ncar/NEON/surf_files/v1/" + site_name + "_surfaceData.csv" ) response = requests.get(url, timeout=TIMEOUT) with open(neon_file, "wb") as a_file: a_file.write(response.content) # -- Check if download status_code if response.status_code == 200: print("Download finished successfully for", site_name) elif response.status_code == 404: sys.exit( "Data for this site " + site_name + " was not available on the neon server:" + url ) print("Download exit status code: ", response.status_code) print("Downloaded file type : ", response.headers["content-type"]) print("Downloaded file encoding : ", response.encoding) print("------------------------------------------------") response.close() return neon_file def find_surffile(surf_dir, site_name, pft_16): """ Function for finding and choosing surface file for a neon site. These files are created using ./subset_data.py script. In case multiple files exist for the neon site, it will choose the file created the latest. Args: surf_dir (str): directory of single point surface data site_name (str): 4 letter neon site name pft_16 (bool): if true, use 16-PFT version of surface data file Raises: Error if the surface data for the site is not created Returns: surf_file (str): name of the surface dataset file """ if pft_16: sf_name = "surfdata_1x1_NEON_" + site_name + "*hist_2000_16pfts*.nc" else: sf_name = "surfdata_1x1_NEON_" + site_name + "*hist_2000_78pfts*.nc" print(os.path.join(surf_dir, sf_name)) surf_file = sorted(glob.glob(os.path.join(surf_dir, sf_name))) if len(surf_file) > 1: print("The following files found :", *surf_file, sep="\n- ") print("The latest file is chosen :", surf_file[-1]) surf_file = surf_file[-1] elif len(surf_file) == 1: print("File found : ") print(surf_file) surf_file = surf_file[0] else: sys.exit( "Surface data for this site " + str(site_name) + " was not found:" + str(surf_dir) + str(sf_name) + "." + "\n" + "Please run ./subset_data.py for this site." ) return surf_file def find_soil_structure(args, surf_file): """ Function for finding surface dataset soil structure using surface data metadata. In CLM surface data, soil layer information is in a file from surface data metadata under "Soil_texture_raw_data_file_name". This function finds this file for the surface dataset, read it, and find soil layers. args: surf_file (str): single point surface data filename Raises: error if the soil layer strucutre file does not exist Returns: soil_bot : array of soil layers top depths soil_top : array of soil layers bottom depths """ # TODO: What if not cheyenne? Self-contained depth info. print("------------") print("surf_file : ", surf_file) f_1 = xr.open_dataset(surf_file) print("------------") # print (f_1.attrs["Soil_texture_raw_data_file_name"]) clm_input_dir = os.path.join(args.inputdatadir, "lnd/clm2/rawdata/") surf_soildepth_file = os.path.join( clm_input_dir, f_1.attrs["soil_texture_lookup_raw_data_file_name"] ) if os.path.exists(surf_soildepth_file): print( "\n\n Reading", surf_soildepth_file, "for surface data soil structure information:", ) f_1_soildepth = xr.open_dataset(surf_soildepth_file) print(f_1_soildepth["DZSOI"]) soil_bot = f_1_soildepth["DZSOI"].values # -- soil layer top soil_top = soil_bot[:-1] soil_top = np.insert(soil_top, 0, 0) else: sys.exit( "Cannot find soil structure file : " + surf_soildepth_file + "for the surface dataset." ) return soil_bot, soil_top def update_metadata(nc_file, surf_file, neon_file, zb_flag): """ Function for updating modified surface dataset metadata for neon sites. Args: nc_file (xr Dataset): netcdf file including updated neon surface data surf_file (str): single point surface data filename neon_file (str): filename of neon downloaded surface dataset zb_flag (bool): update bedrock Returns: nc_file (xr Dataset): netcdf file including updated neon surface data """ today = date.today() today_string = today.strftime("%Y-%m-%d") nc_file.attrs["Updated_on"] = today_string nc_file.attrs["Updated_by"] = myname nc_file.attrs["Updated_with"] = os.path.abspath(__file__) nc_file.attrs["Updated_from"] = surf_file nc_file.attrs["Updated_using"] = neon_file if zb_flag: nc_file.attrs["Updated_fields"] = "PCT_CLAY, PCT_SAND, ORGANIC, zbedrock" else: nc_file.attrs["Updated_fields"] = "PCT_CLAY, PCT_SAND, ORGANIC" return nc_file def update_time_tag(fname_in): """ Function for updating time tag on surface dataset files. Expects file to end with [._]cYYMMDD.nc or [._]YYMMDD.nc Add the tag to just before that ending part. Args: fname_in (str) : file name with the old time tag Raises: error if the file does not end with [._]cYYMMDD.nc or [._]YYMMDD.nc Returns: fname_out (str) : file name with the updated time tag """ today = date.today() today_string = today.strftime("%y%m%d") basename = os.path.basename(fname_in) cend = -10 if basename[cend] == "c": cend = cend - 1 if (basename[cend] != ".") and (basename[cend] != "_"): sys.exit("Trouble figuring out where to add tag to filename:" + fname_in) fname_out = basename[:cend] + "_" + "c" + today_string + ".nc" return fname_out def sort_print_soil_layers(obs_bot, soil_bot): """ Function for pretty printing soil structure of original surface dataset and neon dataset. Args: obs_bot : array of neon soil layers bottom depths soil_bot : array of soil layers bottom depths """ obs_bot_df = pd.DataFrame({"depth": obs_bot, "type": "obs"}) soil_bot_df = pd.DataFrame({"depth": soil_bot, "type": "sfc"}) depth_df = pd.concat([obs_bot_df, soil_bot_df]) depth_df = depth_df.sort_values("depth") space = " " print("================================", "================================") print(" Neon data soil structure: ", " Surface data soil structure: ") print("================================", "================================") for _, row in depth_df.iterrows(): if row["type"] == "obs": print("-------------", "{0:.3f}".format(row["depth"]), "------------") else: print( 33 * space + "-------------", "{0:.3f}".format(row["depth"]), "-----------", ) print("--------------------------------" + "--------------------------------") def check_neon_time(): """ A function to download and parse neon listing file. Returns: dict_out (str) : dictionary of *_surfaceData.csv files with the last modified """ listing_file = "listing.csv" url = "https://storage.neonscience.org/neon-ncar/listing.csv" download_file(url, listing_file) d_f = pd.read_csv(listing_file) d_f = d_f[d_f["object"].str.contains("_surfaceData.csv")] dict_out = dict(zip(d_f["object"], d_f["last_modified"])) print(dict_out) return dict_out def download_file(url, fname): """ Function to download a file. Args: url (str): url of the file for downloading fname (str) : file name to save the downloaded file. """ try: response = requests.get(url, timeout=TIMEOUT) with open(fname, "wb") as a_file: a_file.write(response.content) # -- Check if download status_code if response.status_code == 200: print("Download finished successfully for", fname, ".") elif response.status_code == 404: print("File " + fname + "was not available on the neon server:" + url) except Exception as err: print("The server could not fulfill the request.") print("Something went wrong in downloading", fname) raise err def fill_interpolate(f_2, var, method): """ Function to interpolate a variable in a xarray dataset a specific method """ print("=====================================") print("Filling in ", var, "with interpolation (method =" + method + ").") print("Variable before filling : ") print(f_2[var]) tmp_df = pd.DataFrame(f_2[var].values.ravel()) tmp_df = tmp_df.interpolate(method=method, limit_direction="both") tmp = tmp_df.to_numpy() soil_levels = f_2[var].size for soil_lev in range(soil_levels): f_2[var][soil_lev] = tmp[soil_lev].reshape(1, 1) print("Variable after filling : ") print(f_2[var]) print("=====================================") def print_neon_data_soil_structure(obs_bot, soil_bot, bin_index): """ Print info about NEON data soil structure """ print("================================") print(" Neon data soil structure: ") print("================================") print("------------", "ground", "------------") for i, this_obs_bot in enumerate(obs_bot): print("layer", i) print("-------------", "{0:.2f}".format(this_obs_bot), "-------------") print("================================") print("Surface data soil structure: ") print("================================") print("------------", "ground", "------------") for this_bin in range(len(bin_index)): print("layer", this_bin) print("-------------", "{0:.2f}".format(soil_bot[this_bin]), "-------------") def print_soil_quality( inorganic, bin_index, soil_lev, layer_depth, carbon_tot, estimated_oc, bulk_den, f_2 ): """ Prints information about soil quality """ print("~~~~~~~~~~~~~~~~~~~~~~~~") print("inorganic:") print("~~~~~~~~~~~~~~~~~~~~~~~~") print(inorganic) print("~~~~~~~~~~~~~~~~~~~~~~~~") print("bin_index : ", bin_index[soil_lev]) print("layer_depth : ", layer_depth) print("carbon_tot : ", carbon_tot) print("estimated_oc : ", estimated_oc) print("bulk_den : ", bulk_den) print("organic :", f_2["ORGANIC"][soil_lev].values) print("--------------------------") def update_agri_site_info(site_name, f_2): """ Updates agricultural sites """ ag_sites = ["KONA", "STER"] if site_name not in ag_sites: return f_2 print("Updating PCT_NATVEG") print("Original : ", f_2.PCT_NATVEG.values) f_2.PCT_NATVEG.values = [[0.0]] print("Updated : ", f_2.PCT_NATVEG.values) print("Updating PCT_CROP") print("Original : ", f_2.PCT_CROP.values) f_2.PCT_CROP.values = [[100.0]] print("Updated : ", f_2.PCT_CROP.values) print("Updating PCT_NAT_PFT") print(f_2.PCT_NAT_PFT.values[0]) print(f_2.PCT_NAT_PFT[0].values) return f_2 def update_fields_with_neon(f_1, d_f, bin_index): """ update fields with neon """ f_2 = f_1 soil_levels = f_2["PCT_CLAY"].size for soil_lev in range(soil_levels): print("--------------------------") print("soil_lev:", soil_lev) print(d_f["clayTotal"][bin_index[soil_lev]]) f_2["PCT_CLAY"][soil_lev] = d_f["clayTotal"][bin_index[soil_lev]] f_2["PCT_SAND"][soil_lev] = d_f["sandTotal"][bin_index[soil_lev]] bulk_den = d_f["bulkDensExclCoarseFrag"][bin_index[soil_lev]] carbon_tot = d_f["carbonTot"][bin_index[soil_lev]] estimated_oc = d_f["estimatedOC"][bin_index[soil_lev]] # -- estimated_oc in neon data is rounded to the nearest integer. # -- Check to make sure the rounded oc is not higher than carbon_tot. # -- Use carbon_tot if estimated_oc is bigger than carbon_tot. estimated_oc = min(estimated_oc, carbon_tot) layer_depth = ( d_f["biogeoBottomDepth"][bin_index[soil_lev]] - d_f["biogeoTopDepth"][bin_index[soil_lev]] ) # f_2["ORGANIC"][soil_lev] = estimated_oc * bulk_den / 0.58 # -- after adding caco3 by NEON: # -- if caco3 exists: # -- inorganic = caco3/100.0869*12.0107 # -- organic = carbon_tot - inorganic # -- else: # -- organic = estimated_oc * bulk_den /0.58 caco3 = d_f["caco3Conc"][bin_index[soil_lev]] inorganic = caco3 / 100.0869 * 12.0107 print("inorganic:", inorganic) if not np.isnan(inorganic): actual_oc = carbon_tot - inorganic else: actual_oc = estimated_oc f_2["ORGANIC"][soil_lev] = actual_oc * bulk_den / 0.58 print_soil_quality( inorganic, bin_index, soil_lev, layer_depth, carbon_tot, estimated_oc, bulk_den, f_2 ) return f_2 def main(): """modify_singlept_site_neon main function""" args = get_parser().parse_args() # -- debugging option if args.debug: logging.basicConfig(level=logging.DEBUG) # Check if pandas is a recent enough version pdvers = pd.__version__ if version.parse(pdvers) < version.parse("1.1.0"): sys.exit( """The pandas version in your python environment is too old, update to a newer version of pandas (>=1.1.0): version=%s""", pdvers, ) # file_time = check_neon_time() # -- specify site from which to extract data site_name = args.site_name # -- Look for surface data surf_dir = args.surf_dir surf_file = find_surffile(surf_dir, site_name, args.pft_16) # -- directory structure clone_dir = os.path.abspath(os.path.join(__file__, "../../../..")) neon_dir = os.path.join(clone_dir, "neon_surffiles") # -- download neon data if needed neon_file = get_neon(neon_dir, site_name) # -- Read neon data d_f = pd.read_csv(neon_file) # -- Read surface dataset files print("surf_file:", surf_file) f_1 = xr.open_dataset(surf_file) # -- Find surface dataset soil depth information soil_bot, soil_top = find_soil_structure(args, surf_file) # -- Find surface dataset soil levels # TODO: how? NS uses metadata on file to find # soil strucure # better suggestion by WW to write dzsoi to neon surface dataset # This todo needs to go to the subset_data soil_top = np.cumsum(soil_top) soil_bot = np.cumsum(soil_bot) soil_mid = 0.5 * (soil_bot - soil_top) + soil_top # print ("Cumulative sum of soil bottom depths :", sum(soil_bot)) obs_bot = d_f["biogeoBottomDepth"] / 100 # -- Mapping surface dataset and neon soil levels bins = d_f["biogeoTopDepth"] / 100 bin_index = np.digitize(soil_mid, bins) - 1 print_neon_data_soil_structure(obs_bot, soil_bot, bin_index) # -- update fields with neon f_2 = update_fields_with_neon(f_1, d_f, bin_index) # -- Interpolate missing values method = "linear" fill_interpolate(f_2, "PCT_CLAY", method) fill_interpolate(f_2, "PCT_SAND", method) fill_interpolate(f_2, "ORGANIC", method) # -- Update zbedrock if neon observation does not make it down to 2m depth rock_thresh = 2 zb_flag = False if obs_bot.iloc[-1] < rock_thresh: print("zbedrock is updated.") f_2["zbedrock"].values[:, :] = obs_bot.iloc[-1] zb_flag = True sort_print_soil_layers(obs_bot, soil_bot) # -- updates for ag sites update_agri_site_info(site_name, f_2) out_dir = args.out_dir # -- make out_dir if it does not exist if not os.path.exists(out_dir): os.makedirs(out_dir) # -- update time tag for the output file wfile = out_dir + update_time_tag(surf_file) # -- update netcdf metadata f_2 = update_metadata(f_2, surf_file, neon_file, zb_flag) print(f_2.attrs) f_2.to_netcdf(path=wfile, mode="w", format="NETCDF3_64BIT") print("Successfully updated surface data file for neon site(" + site_name + "):\n - " + wfile)