#!/bin/bash
#
# Parallel EM training driver.
#
# Usage: <this script> CONFIG_FILE
#
# CONFIG_FILE is sourced and must define the settings used below, including:
#   EMOUT_FILE, NUM_TRN_SENTS, EMTRAIN_CHUNKS, MISC_DIR, MASTER_FILE,
#   GENERATE_CHUNK_DTS, MCVR_ARRAY, MCSR_ARRAY, MAX_EM_ITER, INIT_EM_ITER,
#   INITIAL_GMP, KEEP_ACC, GMTKEMTRAIN, STREAM1_PARAMS, STREAM2_PARAMS,
#   STREAM3_PARAMS, STRFILE, MISC_PARAMS, NODES, NUM_LOCAL,
#   EMTRAIN_PARALLELISM, MEANCLONEFRAC, VARCLONEFRAC, LOG_LIKE_THRESH.
#
# Flow (skipped entirely when EMOUT_FILE already exists and is non-empty):
#   1. Split sentences 0..NUM_TRN_SENTS-1 into EMTRAIN_CHUNKS ranges and
#      generate one master file per range.
#   2. Phase 1: for each EM iteration that has its own entry in MCVR_ARRAY,
#      run one accumulation job per chunk through pmae/pmake, then one
#      combining $GMTKEMTRAIN run that loads the accumulator files and
#      writes learned_params<iter>.gmp plus a log-likelihood file.
#   3. Phase 2: keep iterating with the last -mcvr/-mcsr values until
#      MAX_EM_ITER is reached or the relative log-likelihood change (in
#      percent) is non-negative and at most LOG_LIKE_THRESH.
#   4. Copy the last parameter file to EMOUT_FILE.
set -x

#######################################
# Print a message to stderr and abort the script.
# Arguments: $* - message
#######################################
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

if [[ $# -lt 1 ]]; then
  printf 'Usage: %s CONFIG_FILE\n' "${0##*/}" >&2
  exit 2
fi

# shellcheck disable=SC1090
. "$1" || die "cannot source configuration file: $1"

BREAK_RANGE=/g/ssli/research/gmtk/scripts/breakRange.pl
PMAE=/g/ssli/research/gmtk/scripts/pmae
CP=/bin/cp
GENERATE=/g/ssli/research/gmtk/scripts/generate_masterfile.pl

# Fail early with a clear message rather than deep inside a tool invocation.
: "${EMOUT_FILE:?must be set by the configuration file}"
: "${MISC_DIR:?must be set by the configuration file}"
: "${NUM_TRN_SENTS:?must be set by the configuration file}"
: "${EMTRAIN_CHUNKS:?must be set by the configuration file}"
: "${MASTER_FILE:?must be set by the configuration file}"
: "${MAX_EM_ITER:?must be set by the configuration file}"
: "${INIT_EM_ITER:?must be set by the configuration file}"
: "${GMTKEMTRAIN:?must be set by the configuration file}"
: "${STRFILE:?must be set by the configuration file}"
: "${MCVR_ARRAY:?must be set by the configuration file}"
: "${MCSR_ARRAY:?must be set by the configuration file}"

#######################################
# Print the N-th (1-based) whitespace-separated word of a list.
# Arguments: $1 - list, $2 - index
# Outputs:   the word (empty line if the list is shorter than the index)
#######################################
nth_word() {
  # Newlines are folded to blanks so awk sees a single record; the index
  # is passed with -v instead of being spliced into the awk program.
  printf '%s' "$1" | tr '\n' ' ' | awk -v n="$2" '{ print $n }'
}

#######################################
# Print the number of whitespace-separated words in a string.
# Arguments: $1 - string
#######################################
count_words() {
  local -a words=()
  # read returns non-zero at end of input when no NUL delimiter is found;
  # that is expected here.
  read -r -d '' -a words <<< "$1" || true
  printf '%s\n' "${#words[@]}"
}

#######################################
# Path of the per-range master file.
# Arguments: $1 - sentence range
#######################################
master_file_for() {
  printf '%s\n' "$MISC_DIR/masterfile.$1.params"
}

#######################################
# Path of the log-likelihood file written for an EM iteration.
# Arguments: $1 - EM iteration
#######################################
ll_file_for() {
  printf '%s\n' "$MISC_DIR/loglikestorage.EM$1.data"
}

#######################################
# Path of an accumulator file. With KEEP_ACC=true the iteration number is
# part of the name; otherwise the name depends on the chunk only, so every
# iteration reuses the same files.
# Arguments: $1 - EM iteration
#            $2 - chunk number, or the literal @D placeholder that is
#                 handed to -loadAccFile together with -loadAccRange
#######################################
acc_file_for() {
  if [[ "${KEEP_ACC:-}" == "true" ]]; then
    printf '%s\n' "$MISC_DIR/acc_file_$1_$2.data"
  else
    printf '%s\n' "$MISC_DIR/acc_file_$2.data"
  fi
}

#######################################
# Write one accumulation command line per chunk to stdout.
# Globals:   CHUNK_RNGS, EMTRAIN_CHUNKS, GMTKEMTRAIN, STREAM*_PARAMS,
#            STRFILE, MISC_PARAMS (read)
# Arguments: $1 - EM iteration, $2 - input trainable-parameters file
#######################################
emit_chunk_commands() {
  local iter=$1 in_file=$2
  local chunk cmd
  for ((chunk = 1; chunk <= EMTRAIN_CHUNKS; chunk++)); do
    cmd="$GMTKEMTRAIN $STREAM1_PARAMS $STREAM2_PARAMS $STREAM3_PARAMS"
    cmd+=" -inputMasterFile $(master_file_for "${CHUNK_RNGS[chunk]}")"
    cmd+=" -inputTrainableParameters $in_file"
    cmd+=" -binInputTrainableParameters false"
    cmd+=" -strFile $STRFILE"
    cmd+=" -maxEmIters 1 -random false"
    cmd+=" -trrng ${CHUNK_RNGS[chunk]}"
    cmd+=" -storeAccFile $(acc_file_for "$iter" "$chunk")"
    cmd+=" -accFileIsBinary true $MISC_PARAMS"
    # Each command must stay on a single output line.
    printf '%s\n' "$cmd"
  done
}

#######################################
# Run the per-chunk accumulation jobs for one EM iteration: the command
# list is piped through $PMAE, saved as emacc.EM<iter>.makefile, and fed
# to pmake. pmake output goes to pmake_emtrain.<iter>.out.
# Arguments: $1 - EM iteration, $2 - input trainable-parameters file
#######################################
accumulate_chunks() {
  local iter=$1 in_file=$2
  emit_chunk_commands "$iter" "$in_file" \
    | "$PMAE" "$NODES" \
    | tee "$MISC_DIR/emacc.EM${iter}.makefile" \
    | pmake -L "$NUM_LOCAL" -J "$EMTRAIN_PARALLELISM" -f - \
      > "$MISC_DIR/pmake_emtrain.${iter}.out" 2>&1 \
    || printf '%s: warning: pmake reported failure in EM iteration %s\n' \
      "${0##*/}" "$iter" >&2
}

#######################################
# Combine the chunk accumulators of one EM iteration into a new parameter
# file and store the log-likelihood. Output is appended to the same
# pmake_emtrain.<iter>.out file used by the accumulation step.
# Arguments: $1 - EM iteration
#            $2 - input trainable-parameters file
#            $3 - output trainable-parameters file
#            $4 - value for -mcvr
#            $5 - value for -mcsr
# Returns:   exit status of $GMTKEMTRAIN
#######################################
combine_accumulators() {
  local iter=$1 in_file=$2 out_file=$3 mcvr=$4 mcsr=$5
  # The command and the *_PARAMS variables hold several words each and
  # must be word-split.
  # shellcheck disable=SC2086
  $GMTKEMTRAIN $STREAM1_PARAMS $STREAM2_PARAMS $STREAM3_PARAMS \
    -inputMasterFile "$(master_file_for "${CHUNK_RNGS[1]}")" \
    -inputTrainableParameters "$in_file" \
    -binInputTrainableParameters false \
    -outputTrainableParameters "$out_file" \
    -binOutputTrainableParameters false \
    -strFile "$STRFILE" \
    -maxEmIters 1 \
    -mcvr "$mcvr" \
    -mcsr "$mcsr" \
    -loadAccFile "$(acc_file_for "$iter" '@D')" \
    -loadAccRange "1:$EMTRAIN_CHUNKS" \
    -trrng nil \
    -accFileIsBinary true \
    -llStoreFile "$(ll_file_for "$iter")" \
    -random false \
    -meanCloneSTDfrac "$MEANCLONEFRAC" \
    -covarCloneSTDfrac "$VARCLONEFRAC" \
    $MISC_PARAMS >> "$MISC_DIR/pmake_emtrain.${iter}.out" 2>&1
}

#######################################
# Run one complete EM iteration (accumulate, then combine); abort the
# script when the combine step fails or leaves no log-likelihood behind.
# Arguments: same as combine_accumulators
#######################################
run_em_iteration() {
  local iter=$1 in_file=$2 out_file=$3 mcvr=$4 mcsr=$5
  [[ -n "$mcvr" && -n "$mcsr" ]] \
    || die "no -mcvr/-mcsr value available for EM iteration $iter"
  accumulate_chunks "$iter" "$in_file"
  combine_accumulators "$iter" "$in_file" "$out_file" "$mcvr" "$mcsr" \
    || die "combining accumulators failed in EM iteration $iter (see $MISC_DIR/pmake_emtrain.${iter}.out)"
  [[ -s "$(ll_file_for "$iter")" ]] \
    || die "EM iteration $iter produced no log-likelihood file"
}

#######################################
# Print 100 * (cur - last) / |last|.
# Arguments: $1 - current log-likelihood, $2 - previous log-likelihood
# Returns:   non-zero when the previous log-likelihood is zero
#######################################
ll_percent_change() {
  # Values are passed as arguments, not interpolated into the perl source.
  perl -e '
    my ($cur, $last) = @ARGV;
    my $mag = abs($last);
    die "previous log-likelihood is zero\n" if $mag == 0;
    print 100 * ($cur - $last) / $mag;
  ' -- "$1" "$2"
}

#######################################
# Print 1 when the change is non-negative and at most the threshold,
# otherwise 0.
# Arguments: $1 - percent change, $2 - threshold
#######################################
is_converged() {
  perl -e '
    my ($ratio, $thresh) = @ARGV;
    print(($ratio >= 0 && $ratio <= $thresh) ? "1" : "0");
  ' -- "$1" "$2"
}

if [[ ! -s "$EMOUT_FILE" ]]; then
  MAX_TRN_SENT=$((NUM_TRN_SENTS - 1))
  SENT_RNG="0:$MAX_TRN_SENT"
  SENT_RNGS=$("$BREAK_RANGE" -rng "$SENT_RNG" -n "$EMTRAIN_CHUNKS")

  # Resolve every chunk's sentence range once and generate its master file.
  declare -a CHUNK_RNGS=()
  for ((i = 1; i <= EMTRAIN_CHUNKS; i++)); do
    CUR_RNG=$(nth_word "$SENT_RNGS" "$i")
    [[ -n "$CUR_RNG" ]] \
      || die "$BREAK_RANGE returned no range for chunk $i of $EMTRAIN_CHUNKS"
    CHUNK_RNGS[i]=$CUR_RNG
    CUR_MASTER_FILE=$(master_file_for "$CUR_RNG")
    # NOTE(review): these are treated as two separate commands, the first
    # being a helper command supplied by the configuration file - confirm.
    # shellcheck disable=SC2086
    $GENERATE_CHUNK_DTS "$CUR_RNG" "$MISC_DIR"
    "$GENERATE" "$CUR_RNG" "$MASTER_FILE" > "$CUR_MASTER_FILE"
  done

  # Phase 1 covers the iterations that have their own MCVR_ARRAY entry,
  # capped at MAX_EM_ITER.
  MAX_EM_ITER_NO_LL=$(count_words "$MCVR_ARRAY")
  if ((MAX_EM_ITER_NO_LL > MAX_EM_ITER)); then
    MAX_EM_ITER_NO_LL=$MAX_EM_ITER
  fi

  EM_ITER=$INIT_EM_ITER
  # Also the file that gets copied if neither loop below runs.
  CUR_OUT_FILE=$MISC_DIR/learned_params${EM_ITER}.gmp
  if ((EM_ITER <= 1)); then
    CUR_IN_FILE=${INITIAL_GMP:?must be set by the configuration file}
  else
    # Resume from the output of the previous iteration.
    CUR_IN_FILE=$MISC_DIR/learned_params$((EM_ITER - 1)).gmp
  fi

  # Phase 1: fixed schedule, no convergence test.
  while ((EM_ITER <= MAX_EM_ITER_NO_LL)); do
    CUR_OUT_FILE=$MISC_DIR/learned_params${EM_ITER}.gmp
    CUR_MCVR=$(nth_word "$MCVR_ARRAY" "$EM_ITER")
    CUR_MCSR=$(nth_word "$MCSR_ARRAY" "$EM_ITER")
    run_em_iteration "$EM_ITER" "$CUR_IN_FILE" "$CUR_OUT_FILE" \
      "$CUR_MCVR" "$CUR_MCSR"
    CUR_IN_FILE=$CUR_OUT_FILE
    EM_ITER=$((EM_ITER + 1))
  done

  # Phase 2: iterate until converged or MAX_EM_ITER is reached, using the
  # MCVR/MCSR entries at position MAX_EM_ITER_NO_LL.
  CONVERGED=0
  if ((EM_ITER <= MAX_EM_ITER)); then
    : "${LOG_LIKE_THRESH:?must be set by the configuration file}"
    LAST_LL_OUT_FILE=$(ll_file_for "$((EM_ITER - 1))")
    [[ -s "$LAST_LL_OUT_FILE" ]] \
      || die "missing or empty log-likelihood file: $LAST_LL_OUT_FILE"
    LAST_LL=$(< "$LAST_LL_OUT_FILE")
    MCVR=$(nth_word "$MCVR_ARRAY" "$MAX_EM_ITER_NO_LL")
    MCSR=$(nth_word "$MCSR_ARRAY" "$MAX_EM_ITER_NO_LL")
  fi

  while ((EM_ITER <= MAX_EM_ITER && CONVERGED == 0)); do
    LL_OUT_FILE=$(ll_file_for "$EM_ITER")
    CUR_OUT_FILE=$MISC_DIR/learned_params${EM_ITER}.gmp
    run_em_iteration "$EM_ITER" "$CUR_IN_FILE" "$CUR_OUT_FILE" \
      "$MCVR" "$MCSR"

    CUR_LL=$(< "$LL_OUT_FILE")
    CUR_LL_RATIO=$(ll_percent_change "$CUR_LL" "$LAST_LL") \
      || die "cannot compute log-likelihood change (cur=$CUR_LL, last=$LAST_LL)"
    CONVERGED=$(is_converged "$CUR_LL_RATIO" "$LOG_LIKE_THRESH")
    echo "curr ll is $CUR_LL, last ll is $LAST_LL, ratio is $CUR_LL_RATIO, thresh is $LOG_LIKE_THRESH, converged is $CONVERGED"

    LAST_LL=$CUR_LL
    CUR_IN_FILE=$CUR_OUT_FILE
    EM_ITER=$((EM_ITER + 1))
  done

  echo -----------------------------------------------------
  echo "$(date): Stopped after $((EM_ITER - 1)) iterations."
  echo "Copying $CUR_OUT_FILE to $EMOUT_FILE"
  echo -----------------------------------------------------
  "$CP" "$CUR_OUT_FILE" "$EMOUT_FILE" \
    || die "failed to copy $CUR_OUT_FILE to $EMOUT_FILE"
  echo "$(date): Done training! Now go have a drink."
else
  echo "$(date): NOT EM Training, EM output .gmp file $EMOUT_FILE exists"
fi