#!/bin/bash
# AMPY Full Load tables metadata refresh script
# John Bonifas
# last updated 09-29-2020
#
# From: Fisher Cory D
# Sent: Monday, September 28, 2020 8:52 AM
#  - The directory for an incremental should technically contain the get_max
#    files (even if they aren't being used), while the directory for a full
#    load does not.
#  - The DT partition fields AND the incremental fields are no longer being
#    used as of 092820.
#  - The DT partition field can be found in the
#    load_table_ampy-[TABLENAME]_impala.hql, "select distinct to_date(" line.
#
# What this script does:
#   For every entry listed under $AMPY in HDFS, decide whether the table is a
#   full load. For full-load tables, every file in the table directory whose
#   content contains "as dt" is recorded as a row
#   "<table>,FullLoad,<hdfs path>" in fullloadtables_raw.csv.
#   Progress messages are echoed to the terminal and appended to
#   fullloadtables_log.txt. All output files are created in the current
#   working directory and are overwritten on every run.
#
# References kept from the original script:
#   https://stackoverflow.com/questions/21569172/how-to-list-only-the-file-names-in-hdfs
#   https://stackoverflow.com/questions/17368067/length-of-string-in-bash
#   https://stackoverflow.com/questions/24603037/binary-operator-expected-error-when-checking-if-a-file-with-full-pathname-exists
#   https://unix.stackexchange.com/questions/52800/how-to-do-an-if-statement-from-the-result-of-an-executed-command

# HDFS folder holding one sub-folder of scripts per AMPY table.
AMPY=/development/etl/source/ampy

# Output files (relative to the current working directory).
RAW_CSV=fullloadtables_raw.csv
ERRORS_FILE=fullloadtables_errors.txt
LOG_FILE=fullloadtables_log.txt

#######################################
# Print a progress message and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   the message on stdout and appended to $LOG_FILE
#######################################
log() {
  printf '%s\n' "$1" | tee -a "$LOG_FILE"
}

#######################################
# List the full paths of the entries of an HDFS directory, one per line.
# Arguments: $1 - HDFS directory
# Outputs:   one path per line on stdout
#######################################
list_hdfs_paths() {
  local dir=$1
  local path
  # The path is the 8th whitespace-separated field of each `hdfs dfs -ls`
  # line; `read` puts the remainder of the line in the last variable, so
  # paths containing spaces stay intact. Lines with fewer than 8 fields
  # (such as the "Found N items" summary) leave $path empty and are skipped.
  hdfs dfs -ls "$dir" |
    while IFS=$' \t' read -r _ _ _ _ _ _ _ path; do
      if [[ -n "$path" ]]; then
        printf '%s\n' "$path"
      fi
    done
}

#######################################
# Decide whether a table directory is a full load.
# Arguments: $1 - HDFS directory of the table
# Returns:   0 when `hdfs dfs -find` prints nothing for the pattern
#            (full load), non-zero otherwise (incremental)
#######################################
is_full_load() {
  local dir=$1
  local matches
  # History: as of the cluster operating system upgrades in September 2020
  # the former test stopped working:
  #   if [ -z `hdfs dfs -find $D -name 'get_max_table_ampy-${THETABLE}_impala.hql' ` ]; then
  # NOTE(review): the intent stated by the original comments is "no get_max
  # file => full load", but the pattern below matches ANY file ending in
  # impala.hql, not only get_max files. Kept as found; confirm which pattern
  # is wanted before narrowing it.
  matches=$(hdfs dfs -find "$dir" -name '*impala.hql')
  [[ -z "$matches" ]]
}

#######################################
# Record every file of a table directory that contains a DT line.
# Globals:   RAW_CSV (appended), ERRORS_FILE (appended), LOG_FILE (appended)
# Arguments: $1 - table name
#            $2 - HDFS directory of the table
# Outputs:   one "<table>,FullLoad,<path>" row per matching file in $RAW_CSV
#######################################
record_dt_files() {
  local table=$1
  local dir=$2
  local file
  local -a files=()

  # Read the listing into an array first so the hdfs commands run inside the
  # loop cannot consume the listing through a shared stdin.
  mapfile -t files < <(list_hdfs_paths "$dir")

  for file in "${files[@]}"; do
    # grep's output is discarded rather than using -q so that grep reads the
    # whole stream and hdfs is never cut off by a closed pipe.
    if hdfs dfs -cat "$file" | grep -- "as dt" >/dev/null; then
      log "file: ${file} has a DT line."
      printf '%s,FullLoad,%s\n' "$table" "$file" >>"$RAW_CSV" 2>>"$ERRORS_FILE"
    fi
  done
}

#######################################
# Entry point: reset the output files, then walk the AMPY script folders.
# Globals:   AMPY (read), RAW_CSV, ERRORS_FILE, LOG_FILE (written)
#######################################
main() {
  local count=0 # record counter shown in the progress messages
  local dir table
  local -a table_dirs=()

  # initialize files: CSV header plus empty errors/log files
  printf '%s\n' 'tablename,tabletype,DTfilename' >"$RAW_CSV"
  : >"$ERRORS_FILE"
  : >"$LOG_FILE"

  # loop through the ampy script folders
  mapfile -t table_dirs < <(list_hdfs_paths "$AMPY")

  for dir in "${table_dirs[@]}"; do
    # removes path to get just the tablename
    table=$(basename -- "$dir")

    # progress bar
    count=$((count + 1))
    log "working on table: ${table} ... ${count}"

    if is_full_load "$dir"; then
      log "${table} is full load, adding dt line if exists..."
      record_dt_files "$table" "$dir"
    else
      log "${table} is incremental, skipping."
    fi
  done
}

main "$@"