#!/bin/bash # AMPY Incremental tables metadata refresh script # John Bonifas # last updated 09-28-2020 # From: Fisher Cory D # Sent: Monday, September 28, 2020 8:52 AM # - The directory for an incremental should technically contain the get_max files (even if they arent being used), # while the directory for a full load does not. # - The DT partition fields AND the incremental fields are no longer being used as of 092820. # - In the case of tblphoenix_xml, the ID field is the incremental field. # So, the incremental field can be found in the get_max_table_ampy-[TABLENAME]_impala.hql, # "SELECT CAST(COALESCE(MAX(CAST(" line. # - The DT partition field can be found in the load_table_ampy-[TABLENAME]_impala.hql, "select distinct to_date(" line. AMPY=/development/etl/source/ampy IFS=$'\n' # record counter let "var=0" # initialize files echo tablename'@'tabletype'@'incrementalfield'@'partitionfield > incrementaltables_raw.csv echo '' > incrementaltables_errors.txt echo '' > incrementaltables_log.txt # https://stackoverflow.com/questions/21569172/how-to-list-only-the-file-names-in-hdfs # loop through the ampy script files for D in `hdfs dfs -ls $AMPY | sed 1d | perl -wlne'print +(split " ",$_,8)[7]'` ; do # removes path to get just the tablename THETABLE=`basename $D` # progress bar let "var++" echo working on table: $THETABLE ... $var | tee -a incrementaltables_log.txt; # https://stackoverflow.com/questions/17368067/length-of-string-in-bash # if it doesn't find a GET_MAX file it considers it a full load # https://stackoverflow.com/questions/24603037/binary-operator-expected-error-when-checking-if-a-file-with-full-pathname-exists # https://unix.stackexchange.com/questions/52800/how-to-do-an-if-statement-from-the-result-of-an-executed-command # As of the cluster operating system upgrades September 2020, if[] statement behavior has changed. # The following won't work anymore: # if [ -z `hdfs dfs -find $D -name 'get_max_table_ampy-${THETABLE}_impala.hql' ` ]; then if [[ -z `hdfs dfs -find $D -name '*impala.hql'` ]]; then echo $THETABLE is full load, skipping. | tee -a incrementaltables_log.txt; else echo $THETABLE is incremental, adding to flatfile... | tee -a incrementaltables_log.txt; # https://stackoverflow.com/questions/31107065/write-text-from-comand-line-into-hadoop/31122703 # https://unix.stackexchange.com/questions/88038/print-matching-line-and-nth-line-from-the-matched-line # in some files, there seems to be an extraneous carriage return, which inexplicably adds an extra comma, # so we pipe it thru sed to get rid of it # https://stackoverflow.com/questions/800030/remove-carriage-return-in-unix # the check column option and the get_max field must match. If the field is missing or wrong its an error. # It's ok for the check column and splitby fields to be different. # https://stackoverflow.com/questions/31306382/display-only-matched-string-with-grep # CHECKCOLUMN=`hdfs dfs -cat $D/import_argument_ampy-$THETABLE.txt | awk '/--check-column/{nr[NR+1]; next}; NR in nr' | sed 's/\r$//'` # GETMAX=`hdfs dfs -cat $D/get_max_table_ampy-${THETABLE}_impala.hql | grep -o "$CHECKCOLUMN"` # SPLITPART=`hdfs dfs -cat $D/import_argument_ampy-$THETABLE.txt | awk '/--split-by/{nr[NR+1]; next}; NR in nr' | sed 's/\r$//'` # echo $THETABLE','$CHECKCOLUMN','$GETMAX','$SPLITPART >> incrementaltables_raw.csv 2>>incrementaltables_errors.txt; # as of the Summer 2020 new code implementation, the checkcolumn and splitby fields have been removed. # This is the 'new' way of extracting the partition field. # https://www.geeksforgeeks.org/cut-command-linux-examples/ # https://stackoverflow.com/questions/45921699/bash-substring-from-first-occurrence-of-a-character-to-the-second-occurrence INCREMENTALFIELD=`hdfs dfs -cat $AMPY/$THETABLE/get_max_table_ampy-${THETABLE}_impala.hql | grep "SELECT CAST(COALESCE(MAX("` PARTITIONFIELD=`hdfs dfs -cat $AMPY/$THETABLE/load_table_ampy-${THETABLE}_impala.hql | grep "as dt"` echo $THETABLE'@Incremental@'$INCREMENTALFIELD'@'$PARTITIONFIELD >> incrementaltables_raw.csv 2>>incrementaltables_errors.txt; fi done