#!/bin/bash

# AMPY: get the import statement parameters (split-by, number of mappers, fetch size) for each table
# John Bonifas
# last updated 11-05-2020

# Directory passed to `hdfs dfs -ls` by the loop below, and the common
# prefix of the three output files (<prefix>.csv, <prefix>_errors.txt,
# <prefix>_log.txt).
AMPY=/development/etl/source/ampy
PARAMETER=importstatementparms

# Split unquoted expansions on newlines only (not on spaces or tabs).
IFS=$'\n'

# Running count of processed tables; shown in the progress message.
var=0

# Start every run with fresh output files: the CSV gets its header row,
# the error and log files each start with a single empty line.
printf '%s\n' 'tablename,splitby,numberofmappers,fetchsize' > "${PARAMETER}.csv"
printf '\n' > "${PARAMETER}_errors.txt"
printf '\n' > "${PARAMETER}_log.txt"

# Path-only listing technique (8th column of `hdfs dfs -ls`) taken from:
# https://stackoverflow.com/questions/21569172/how-to-list-only-the-file-names-in-hdfs

#######################################
# Print the line that follows each occurrence of an option in an
# import-argument file (one option or value per line).
# An option line must equal one of the given spellings exactly, ignoring
# surrounding blanks and a trailing carriage return, so "-m" no longer
# matches unrelated lines such as "--map-column-java".
# Arguments: one or more spellings of the option (e.g. -m --num-mappers)
# Inputs:    file content on stdin
# Outputs:   each value on its own line, trailing CR removed; nothing when
#            the option is absent
#######################################
option_value() {
  # Join "$*" with spaces even though the script-wide IFS is a newline.
  local IFS=' '
  awk -v names="$*" '
    BEGIN {
      count = split(names, list, " ")
      for (i = 1; i <= count; i++) wanted[list[i]] = 1
    }
    {
      sub(/\r$/, "")
      # The line right after a matching option is that option'"'"'s value.
      if (take) { print; take = 0; next }
      key = $0
      gsub(/^[ \t]+|[ \t]+$/, "", key)
      if (key in wanted) take = 1
    }
  '
}

# Loop through the entries listed under $AMPY. For each entry that contains
# import_argument_ampy-<table>.txt, append one CSV row:
#   table,split-by,number-of-mappers,fetch-size
# The listing is read on fd 3 so commands in the body cannot consume it.
while IFS= read -r D <&3; do
  # Listing lines with fewer than 8 columns produce an empty path; skip them.
  [[ -n "$D" ]] || continue

  # removes path to get just the tablename
  THETABLE=$(basename -- "$D")

  # progress bar
  var=$((var + 1))
  printf 'working on table: %s ... %s\n' "$THETABLE" "$var" \
    | tee -a "${PARAMETER}_log.txt"

  ARGFILE="$D/import_argument_ampy-$THETABLE.txt"
  if hdfs dfs -test -e "$ARGFILE"; then
    # Fetch the file once instead of once per parameter; stderr of the
    # hdfs command itself goes to the errors file.
    if ! ARGS=$(hdfs dfs -cat "$ARGFILE" 2>>"${PARAMETER}_errors.txt"); then
      # Give the hdfs error some context; the row is still written (with
      # whatever could be read) to keep the output shape unchanged.
      printf 'could not read %s\n' "$ARGFILE" >>"${PARAMETER}_errors.txt"
    fi

    SPLITBY=$(option_value --split-by <<<"$ARGS")
    NUMBEROFMAPPERS=$(option_value -m --num-mappers <<<"$ARGS")
    FETCHSIZE=$(option_value --fetch-size <<<"$ARGS")

    # An option given more than once yields several lines; fold them onto
    # one line so every table stays a single CSV row.
    printf '%s,%s,%s,%s\n' \
      "$THETABLE" \
      "${SPLITBY//$'\n'/ }" \
      "${NUMBEROFMAPPERS//$'\n'/ }" \
      "${FETCHSIZE//$'\n'/ }" >>"${PARAMETER}.csv"
  fi

done 3< <(hdfs dfs -ls "$AMPY" | sed 1d | perl -wlne'print +(split " ",$_,8)[7]')