#!/bin/bash
#
# Parallel EM training driver for GMTK (gmtkEMtrain).
#
# Usage: <this script> [PARAMS_FILE]
#
# The first section below defines the default training parameters. If
# PARAMS_FILE is given it is sourced afterwards and may override any of them.
#
# Outline of what the script does (skipped entirely when EMOUT_FILE already
# exists and is non-empty):
#   1. Split the sentence range 0..NUM_TRN_SENTS-1 into EMTRAIN_CHUNKS
#      sub-ranges and generate one master file per sub-range in MISC_DIR.
#   2. Phase 1: run EM iterations INIT_EM_ITER..MAX_EM_ITER_NO_LL, taking the
#      -mcvr / -mcsr values for each iteration from MCVR_ARRAY / MCSR_ARRAY.
#      No log-likelihood convergence test is applied in this phase.
#   3. Phase 2: keep iterating with the -mcvr / -mcsr values found at position
#      MAX_EM_ITER_NO_LL of the arrays until the relative log-likelihood
#      change (in percent) is within [0, LOG_LIKE_THRESH] or MAX_EM_ITER is
#      reached.
#   4. Copy the last learned parameter file to EMOUT_FILE.
#
# Each EM iteration has two steps: one gmtkEMtrain command per chunk is
# emitted and run through pmae/pmake (each stores an accumulator file), then
# a single gmtkEMtrain run loads the accumulator files and writes the updated
# parameters and the log-likelihood.

###############################################################################
# Default training parameters
###############################################################################

# "true": accumulator file names include the EM iteration number, so every
# iteration gets its own files. Anything else: the same names are reused.
KEEP_ACC="false"
# First EM iteration to run. Values > 1 start from
# MISC_DIR/learned_params<INIT_EM_ITER-1>.gmp instead of INITIAL_GMP.
INIT_EM_ITER=1
CPU=linux   # not referenced by the training code in this file
GMTKEMTRAIN=/homes/chiaping/src/gmtk/tksrc/gmtkEMtrain

WORKDIR=.
PARAMS_DIR=$WORKDIR/PARAMS
LEARNED_PARAMS_DIR=$WORKDIR/LEARNED_PARAMS
MISC_DIR=$WORKDIR/MISC
# Scratch directory for master files, accumulators, logs and intermediate
# parameter files. (The original test read "[! -d", which is not a valid
# command, so the directory was never created.)
mkdir -p "$MISC_DIR"

OBS_FILE=$WORKDIR/DATA/train_multi.scp
MASTER_FILE=$PARAMS_DIR/masterFileTrain.params
INITIAL_GMP=$PARAMS_DIR/initialParams
EMOUT_FILE=$LEARNED_PARAMS_DIR/learned_params_multi.gmp
STRFILE=$PARAMS_DIR/aurora_train.str

# Building blocks for the per-iteration -mcvr / -mcsr schedules below.
nV=1e20
split=1e-15
nS=1e20
# Entry N of each array is used for EM iteration N during phase 1.
MCVR_ARRAY="$nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV $nV"
MCSR_ARRAY="$nS $nS $nS $nS $nS $nS $nS $split $nS $nS $nS $nS $nS $nS $nS $split $nS $nS $nS $nS $nS $nS $nS $split $nS $nS $nS $nS $nS $nS $nS $split $nS $nS $nS $nS $nS $nS $nS $nS"

MEANCLONEFRAC=0.25     # passed as -meanCloneSTDfrac
VARCLONEFRAC=0.0       # passed as -covarCloneSTDfrac
DLINKCLONEFRAC=0.0     # passed as -dlinkCloneSTDfrac (via MISC_PARAMS)

# Observation-stream options. STREAM2_PARAMS / STREAM3_PARAMS are also
# expanded on the gmtkEMtrain command lines; they are empty unless the
# sourced PARAMS_FILE defines them.
STREAM1_PARAMS=" -of1 $OBS_FILE -nf1 39 -fmt1 htk -iswp1 true "
MISC_PARAMS="-dlinkCloneSTDfrac $DLINKCLONEFRAC"

LABEL_FILE=$WORKDIR/labels/Multi08TR_sp.mlf
UTT_FILE=$WORKDIR/labels/multi_train.list
# Command prefix; the chunk range and MISC_DIR are appended when it is run.
GENERATE_CHUNK_DTS="$WORKDIR/scripts/generate_chunk_dts ${LABEL_FILE} ${UTT_FILE}"

NUM_TRN_SENTS=8440
MAX_EM_ITER=40
# Convergence threshold, in percent of |previous log-likelihood|.
LOG_LIKE_THRESH=0.001

EMTRAIN_PARALLELISM=40   # passed to pmake -J
EMTRAIN_CHUNKS=80        # number of sentence sub-ranges / accumulator files
NUM_LOCAL=0              # passed to pmake -L
# Node specification handed to pmae as a single argument.
NODES="bird1 OR bird2 OR bird3 OR bird4 OR bird5 OR bird6 OR bird7 OR bird8 OR bird9 OR bird10 OR bird11 OR bird12 OR bird13 OR bird14 OR bird16 OR bird17 OR bird18 OR bird19 OR bird20 OR bird21 OR bird22 OR bird23 OR bird24 OR bird25 OR bird26 OR bird27 OR bird28 OR bird33 OR bird34 OR bird35"

###############################################################################
# Training driver
###############################################################################

set -x

# Optional parameter file: anything it defines overrides the defaults above.
if [ -n "$1" ]; then
  . "$1"
fi

BREAK_RANGE=/g/ssli/research/gmtk/scripts/breakRange.pl
PMAE=/g/ssli/research/gmtk/scripts/pmae
CP=/bin/cp
GENERATE=/g/ssli/research/gmtk/scripts/generate_masterfile.pl

#######################################
# Print the N-th (1-based) whitespace-separated word of a list.
# Arguments: $1 - index N; remaining arguments - the words of the list
#            (callers pass the list variable unquoted so that it splits).
# Outputs:   the selected word on stdout (empty line if N is out of range).
#######################################
nth_word() {
  local n=$1
  shift
  printf '%s\n' "$*" | awk -v n="$n" '{ print $n }'
}

#######################################
# Print the number of arguments received; used to count the words of a
# list variable that is passed unquoted.
#######################################
count_words() {
  echo "$#"
}

#######################################
# Set the file names used by EM iteration EM_ITER.
# Globals: reads  KEEP_ACC, MISC_DIR, EM_ITER
#          writes ACC_PREFIX, ABSTRACT_ACC_FILE, LL_OUT_FILE, CUR_OUT_FILE
#######################################
set_iteration_paths() {
  if [ "$KEEP_ACC" = "true" ]; then
    ACC_PREFIX=$MISC_DIR/acc_file_${EM_ITER}_
  else
    ACC_PREFIX=$MISC_DIR/acc_file_
  fi
  # "@D" is passed through literally to -loadAccFile together with
  # -loadAccRange; presumably gmtkEMtrain substitutes the chunk number
  # for it (confirm against the GMTK documentation).
  ABSTRACT_ACC_FILE=${ACC_PREFIX}@D.data
  LL_OUT_FILE=$MISC_DIR/loglikestorage.EM${EM_ITER}.data
  CUR_OUT_FILE=$MISC_DIR/learned_params${EM_ITER}.gmp
}

#######################################
# Print one gmtkEMtrain command line per chunk for the current iteration.
# Each command reads CUR_IN_FILE, processes one sentence sub-range and
# stores its accumulators in ${ACC_PREFIX}<chunk>.data.
# Globals: reads FOR_RANGE, SENT_RNGS, MISC_DIR, CUR_IN_FILE, ACC_PREFIX,
#          GMTKEMTRAIN, STREAM*_PARAMS, STRFILE, MISC_PARAMS
# Outputs: the command lines on stdout.
#######################################
emit_chunk_commands() {
  local i rng cmd
  for i in $FOR_RANGE; do
    # SENT_RNGS is deliberately unquoted: it is a whitespace-separated list.
    rng=$(nth_word "$i" $SENT_RNGS)
    cmd="$GMTKEMTRAIN $STREAM1_PARAMS $STREAM2_PARAMS $STREAM3_PARAMS"
    cmd="$cmd -inputMasterFile $MISC_DIR/masterfile.$rng.params -inputTrainableParameters $CUR_IN_FILE"
    cmd="$cmd -binInputTrainableParameters false -strFile $STRFILE"
    cmd="$cmd -maxEmIters 1 -random false -trrng $rng"
    cmd="$cmd -storeAccFile ${ACC_PREFIX}$i.data -accFileIsBinary true $MISC_PARAMS"
    echo "$cmd"
  done
}

#######################################
# Run the per-chunk commands of the current iteration through pmae/pmake.
# The generated makefile is kept in MISC_DIR/emacc.EM<iter>.makefile and all
# pmake output goes to MISC_DIR/pmake_emtrain.<iter>.out (overwritten).
# Globals: reads PMAE, NODES, MISC_DIR, EM_ITER, NUM_LOCAL,
#          EMTRAIN_PARALLELISM
#######################################
run_parallel_accumulation() {
  emit_chunk_commands \
    | "$PMAE" "$NODES" \
    | tee "$MISC_DIR/emacc.EM${EM_ITER}.makefile" \
    | pmake -L "$NUM_LOCAL" -J "$EMTRAIN_PARALLELISM" -f - \
      > "$MISC_DIR/pmake_emtrain.${EM_ITER}.out" 2>&1
}

#######################################
# Run the single gmtkEMtrain invocation that loads the accumulator files of
# the current iteration, writes the new parameters to CUR_OUT_FILE and the
# log-likelihood to LL_OUT_FILE. Output is appended to the iteration's
# pmake_emtrain log.
# Arguments: $1 - value for -mcvr, $2 - value for -mcsr
# Globals:   reads SENT_RNGS, MISC_DIR, CUR_IN_FILE, CUR_OUT_FILE,
#            ABSTRACT_ACC_FILE, LL_OUT_FILE, EMTRAIN_CHUNKS, EM_ITER, ...
#######################################
update_parameters() {
  local first_rng
  # The master file of the first chunk is used for this run.
  first_rng=$(nth_word 1 $SENT_RNGS)
  # GMTKEMTRAIN and the *_PARAMS variables are deliberately unquoted: they
  # hold several command-line words each.
  $GMTKEMTRAIN $STREAM1_PARAMS $STREAM2_PARAMS $STREAM3_PARAMS \
    -inputMasterFile "$MISC_DIR/masterfile.$first_rng.params" \
    -inputTrainableParameters "$CUR_IN_FILE" \
    -binInputTrainableParameters false \
    -outputTrainableParameters "$CUR_OUT_FILE" \
    -binOutputTrainableParameters false \
    -strFile "$STRFILE" \
    -maxEmIters 1 \
    -mcvr "$1" -mcsr "$2" \
    -loadAccFile "$ABSTRACT_ACC_FILE" \
    -loadAccRange "1:$EMTRAIN_CHUNKS" \
    -trrng nil \
    -accFileIsBinary true \
    -llStoreFile "$LL_OUT_FILE" \
    -random false \
    -meanCloneSTDfrac "$MEANCLONEFRAC" \
    -covarCloneSTDfrac "$VARCLONEFRAC" \
    $MISC_PARAMS \
    >> "$MISC_DIR/pmake_emtrain.${EM_ITER}.out" 2>&1
}

if [ ! -s "$EMOUT_FILE" ]; then

  # ---- Split the training set into chunks and build per-chunk inputs ----
  MAX_TRN_SENT=$((NUM_TRN_SENTS - 1))
  SENT_RNG="0:$MAX_TRN_SENT"
  SENT_RNGS=$("$BREAK_RANGE" -rng "$SENT_RNG" -n "$EMTRAIN_CHUNKS")
  FOR_RANGE=$(perl -e "print join(' ', 1..$EMTRAIN_CHUNKS)")

  for i in $FOR_RANGE; do
    CUR_RNG=$(nth_word "$i" $SENT_RNGS)
    CUR_MASTER_FILE=$MISC_DIR/masterfile.$CUR_RNG.params
    # GENERATE_CHUNK_DTS is a command prefix and must word-split.
    $GENERATE_CHUNK_DTS "$CUR_RNG" "$MISC_DIR"
    "$GENERATE" "$CUR_RNG" "$MASTER_FILE" > "$CUR_MASTER_FILE"
  done

  # ---- Phase 1 bound: one iteration per MCVR_ARRAY entry, capped ----
  MAX_EM_ITER_NO_LL=$(count_words $MCVR_ARRAY)
  if [ "$MAX_EM_ITER_NO_LL" -gt "$MAX_EM_ITER" ]; then
    MAX_EM_ITER_NO_LL=$MAX_EM_ITER
  fi

  EM_ITER=$INIT_EM_ITER
  # Initialised here as well so that the final copy step has a defined
  # CUR_OUT_FILE even if neither training loop executes.
  LL_OUT_FILE=$MISC_DIR/loglikestorage.EM${EM_ITER}.data
  CUR_OUT_FILE=$MISC_DIR/learned_params${EM_ITER}.gmp

  if [ "$EM_ITER" -le 1 ]; then
    CUR_IN_FILE=$INITIAL_GMP
  else
    # Resume from the parameters written by the previous iteration.
    LAST_EM_ITER=$((EM_ITER - 1))
    CUR_IN_FILE=$MISC_DIR/learned_params${LAST_EM_ITER}.gmp
  fi

  # ---- Phase 1: scheduled iterations, no convergence test ----
  while [ "$EM_ITER" -le "$MAX_EM_ITER_NO_LL" ]; do
    set_iteration_paths
    run_parallel_accumulation

    CUR_MCVR=$(nth_word "$EM_ITER" $MCVR_ARRAY)
    CUR_MCSR=$(nth_word "$EM_ITER" $MCSR_ARRAY)
    update_parameters "$CUR_MCVR" "$CUR_MCSR"

    CUR_IN_FILE=$CUR_OUT_FILE
    EM_ITER=$((EM_ITER + 1))
  done

  # ---- Phase 2: iterate until the log-likelihood converges ----
  LAST_EM_ITER=$((EM_ITER - 1))
  LAST_LL_OUT_FILE=$MISC_DIR/loglikestorage.EM${LAST_EM_ITER}.data
  LAST_LL=$(cat "$LAST_LL_OUT_FILE")

  # From here on the -mcvr / -mcsr values stay fixed at the entry with
  # index MAX_EM_ITER_NO_LL of the schedules.
  MCVR=$(nth_word "$MAX_EM_ITER_NO_LL" $MCVR_ARRAY)
  MCSR=$(nth_word "$MAX_EM_ITER_NO_LL" $MCSR_ARRAY)

  CUR_LL_RATIO=100000000
  CONVERGED=0

  while [ "$EM_ITER" -le "$MAX_EM_ITER" ] && [ "$CONVERGED" -eq 0 ]; do
    set_iteration_paths
    run_parallel_accumulation
    update_parameters "$MCVR" "$MCSR"

    CUR_LL=$(cat "$LL_OUT_FILE")
    # Floating-point arithmetic is delegated to perl; the values are
    # spliced into the perl source as numeric literals.
    LAST_LL_MAGNITUDE=$(perl -e "if ($LAST_LL >= 0) { print $LAST_LL; } else { print -1*$LAST_LL; }")
    # Relative change of the log-likelihood, in percent.
    CUR_LL_RATIO=$(perl -e "print 100*($CUR_LL - $LAST_LL)/$LAST_LL_MAGNITUDE;")
    # Converged only when the change is non-negative and no larger than the
    # threshold; a decrease in log-likelihood never counts as convergence.
    CONVERGED=$(perl -e "print(($CUR_LL_RATIO >= 0 && $CUR_LL_RATIO <= $LOG_LIKE_THRESH) ? 1 : 0);")
    echo "curr ll is $CUR_LL, last ll is $LAST_LL, ratio is $CUR_LL_RATIO, thresh is $LOG_LIKE_THRESH, converged is $CONVERGED"

    LAST_LL=$CUR_LL
    CUR_IN_FILE=$CUR_OUT_FILE
    EM_ITER=$((EM_ITER + 1))
  done

  # ---- Publish the final parameters ----
  echo "-----------------------------------------------------"
  echo "$(date): Stopped after $((EM_ITER - 1)) iterations."
  echo "Copying $CUR_OUT_FILE to $EMOUT_FILE"
  echo "-----------------------------------------------------"
  # Nothing above creates the output directory, so make sure it exists
  # before copying the result of a long training run into it.
  mkdir -p "$(dirname "$EMOUT_FILE")"
  if "$CP" "$CUR_OUT_FILE" "$EMOUT_FILE"; then
    echo "$(date): Done training! Now go have a drink."
  else
    echo "$(date): ERROR: could not copy $CUR_OUT_FILE to $EMOUT_FILE" >&2
    exit 1
  fi
else
  echo "$(date): NOT EM Training, EM output .gmp file $EMOUT_FILE exists"
fi