#!/bin/bash # AMPY Fix Varchar8000 data types and ${environment} variables # John Bonifas # last updated 05-13-2020 # | = pipe command output to another command # ` = run command first then pipe the output to the enclosing statement # sed and perl use regex # >> add stdout to a file # 2>> add stderr to a file # && do not move on to next statement until this one completes IFS=$'\n' let "var=0" TARGETFOLDER=/development/etl/source/ampy #TARGETFOLDER=/user/jbbonifa/test echo -e "\nProcessing $TARGETFOLDER ...\n" echo '' > varchar8000files_errors.txt; echo '' > varchar8000files_log.txt; # https://stackoverflow.com/questions/21569172/how-to-list-only-the-file-names-in-hdfs for D in `hdfs dfs -ls $TARGETFOLDER | sed 1d | perl -wlne'print +(split " ",$_,8)[7]'` ; do let "var++" THETABLE=`basename $D` # the tee utility allows output to both stdout and a log file echo working on table: $THETABLE ... $var | tee -a varchar8000files_log.txt; CREATENAME=$D/create_table_ampy-$THETABLE.hql; LOADNAME=$D/load_table_ampy-$THETABLE.hql; # here the THETABLE variable has to be forced to expand IMPALANAME=$D/load_table_ampy-${THETABLE}_impala.hql; CREATEQUERY=`hdfs dfs -cat $CREATENAME 2>>varchar8000files_errors.txt` && if echo $CREATEQUERY | grep -q "varchar(8000)" then # if you quote the variable it will leave the carriage returns in it echo "$CREATEQUERY" | sed 's/varchar(8000)/string/g' | hdfs dfs -put -f - $CREATENAME && hdfs dfs -chmod 775 $CREATENAME && echo "Fixed create." | tee -a varchar8000files_log.txt; RUNQUERY=`echo "$CREATEQUERY" | sed 's/\${environment}/dev_/g;s/\${directory}/development\/data/g;s/varchar(8000)/string/g'`; # https://impala.apache.org/docs/build/html/topics/impala_drop_table.html # If the table was created with the EXTERNAL clause, Impala leaves all files and directories untouched. impala-shell -k -i srphdw010 --quiet -q "use dev_ampy;drop table if exists $THETABLE purge;" && # in HUE, variable syntax is ${variable}, but in impala shell it is ${var:variable}, so you # cannot use the -var option here, you have to sed it in place, escaping the dollar sign and forward slash echo "$RUNQUERY" | impala-shell -k -i srphdw010 --quiet -f - && echo "Dropped and recreated table." | tee -a varchar8000files_log.txt; fi if hdfs dfs -cat $LOADNAME 2>>varchar8000files_errors.txt | grep -q "varchar(8000)" then hdfs dfs -cat $LOADNAME | sed 's/varchar(8000)/string/g' | hdfs dfs -put -f - $LOADNAME && hdfs dfs -chmod 775 $LOADNAME && echo "Fixed load." | tee -a varchar8000files_log.txt; fi if [ `hdfs dfs -find $D -name '*impala.hql'` ] then # the $ character needs to be escaped. if hdfs dfs -cat $IMPALANAME 2>>varchar8000files_errors.txt | grep -q -e '\${environment}' -e 'varchar(8000)' then # Each s...g regex sequence represents 1 replacement hdfs dfs -cat $IMPALANAME | sed 's/\${environment}/dev_ampy/g;s/varchar(8000)/string/g' | hdfs dfs -put -f - $IMPALANAME && hdfs dfs -chmod 775 $IMPALANAME && echo "Fixed Impala load." | tee -a varchar8000files_log.txt; fi fi done