#!/usr/bin/perl -w use strict; ################################# # History: # # version 15: 26 Sept 2003 # * corrected a false error message # # version 14: 24 Sept 2003 # * uems to exclude any LEXEME of subtype un-lex # # version 13: 23 Sept 2003 # * always merge adjacent IPs # # version 12: 11 Sept 2003 # * bug fix # # version 11: 28 Aug 2003 # * renamed program as ag-to-rttm+uems reflecting new functionality # * added code to allow the program to use a list of AG "split files", # so that it inhales all the split files, and treats them all as # one AG file (basically by renaming all the annotations/anchors) # * added flag -I to specify that the program is to use a list of # AG "split files" rather than the name of an AG file # * added code to generate the scoring UEM file for speaker diarization # and the scoring UEM file for disfluencies and SU's. Options # -d and -s specify the names of these files # * added flag -x to have the scoring UEM files exclude overlap regions # # added flag -v to generate verbose progress messages, mostly about # the generation of the scoring UEM files # # version 10: 25 July 2003 # * flag zero duration tokens # * save the command line to the output file as comment # # version 9: 21 July 2003 # * added 'unannotated' as a valid SU subtype # # version 8: 8 July 2003 # * changed unknown gender label from 'other' to 'unknown' # * changed metadata subtype 'explicitEditingTerm' to 'explicit_editing_term' # * took out debug print statement # * fixed defective sort_sus subroutine # # version 7: 7 July 2003 # * added code to change channel value from 'ALL' to '1' # * fixed script to produce the appropriate lexeme class for a given lexeme # if such info exists # * made tokens with alpha subtype to have a period '.' after them # * made confidence score field to be required, output if we don't # have it # * normalized orthography of breath, cough, laughter, lipsmack, and sneeze noises # * undid the change to make SU to contain only tokens that are not # annotated as noRTMetadata # * fixed defective sort_ips subroutine # # version 6: 25 June 2003 # * fixed script to allow output of rttms to be optional # * added command line switch to normalize SWB1 transcription format # (i.e. 'because_1' to 'because') # * converted XML symbolics to normal characters # (i.e. '&' to '&') # * changed SU to contain only tokens that are not annotated as # noRTMetadata # # version 5: 23 June 2003 # * debug IP creation # # version 4: 18 June 2003 # * debug extract_stt_object # * debug/improve output sort # * expand LEXEME subtypes # # version 3: 16 June 2003 # * STT token type/subtype reconciled with MAIA documentation # * STT token category/text interpreted to match CTM subtypes # * SEGMENT records now accommodate speaker name # * field documentation added in leading comments # # version 2: 13 June 2003 # * basic debug complete # # version 1: 11 June 2003 # * rough draft # ################################# #GLOBAL VARIABLES my $dt_MaxSpkrMerge = 0.3; #maximum between-lex gap to merge speaker segments together my $dt_MaxIPMerge = 0.3; #maximum between-IP gap to merge two IP's into one. my %speaker_data; my $begNoscore; # explicit noscore region in AG file my $endNoscore; # explicit noscore region in AG file my $segID = "SG0000"; my $lexID = "LX0000"; my $nlxID = "NL0000"; my $nspID = "NS0000"; my $flrID = "FL0000"; my $edtID = "ED0000"; my $ipID = "IP0000"; my $suID = "SU0000"; my $apID = "AP0000"; my $spkrID = "SP0000"; my $noscoreID = "ZP0000"; my $noRTMetadataID = "NR0000"; my %sort_order = ("NOSCORE" => 0, "NO_RT_METADATA" => 1, "SEGMENT" => 2, "SPEAKER" => 3, "SU" => 4, "A/P" => 5, "CB" => 6, "IP" => 7, "EDIT" => 8, "FILLER" => 9, "NON-SPEECH" => 10, "NON-LEX" => 11, "LEXEME" => 12, "SPKR-INFO" => 13, "SUboundary" => 14); ################################# my ($date, $time) = date_time_stamp(); print "ag-to-rttm+uems run on $date at $time\n"; my $commandline = join(" ", @ARGV); print "command line: ", $0, " ", $commandline, "\n"; my $usage = "\n\n$0 -i -o -s \n\n". "Description: This Perl program converts EARS speech annotation\n". " from AG XML format to RTTM format and creates a scoring uem file\n". " for speaker-diarization and one for SU and disfluency scoring.\n". "\n". "Options:\n". " -c converts SWB1 transcription format to SNOR format.\n". " -x excludes overlap regions from the scoring uem file.\n". " -n includes the [non-standard] NOSCORE records in the rttm file\n". " -h prints this help message.\n". " -v causes verbose progress messages to be printed to STDOUT.\n". "Required arguments:\n". " -i is an AG file.\n". " -I is a text file listing multiple AG files, one per line.\n". "\t ... This option is intended to be used for a split ag file.\n". " -r is the RTTM output filename.\n". " -d is the disfluency and SU scoring-UEM output filename.\n". " -s is the speaker-diarization scoring-UEM output filename.\n"; ################################# #MAIN { use vars qw ($opt_i $opt_I $opt_d $opt_r $opt_s $opt_h $opt_c $opt_n $opt_x $opt_v); use Getopt::Std; getopts ('hcxnvd:i:I:r:s:'); die $usage if defined($opt_h); $opt_i or $opt_I or die "\n\nFATAL ERROR: undefined AG input file$usage"; $opt_d or die "\n\nFATAL ERROR: undefined SU and disfluency scoring UEM output file$usage"; $opt_s or die "\n\nFATAL ERROR: undefined speaker-diarization scoring-UEM output file$usage"; $opt_r or die "\n\nFATAL ERROR: undefined RTTM output file$usage"; my ($data, %signals, %times, %annotations, %channels, $spkr, $rttm); my @spkr_uemsegs = (); my @disfl_uemsegs = (); #get AG data if ($opt_I) { open AGLIST, $opt_I or die "\n\nFATAL ERROR: unable to open AG file list '$opt_I'$usage"; my $aglist_idx = 0; while (my $curr_ag = ) { chomp $curr_ag; next if $curr_ag =~ /^$/; print "Processing $curr_ag\n" if $opt_v; open DATA, $curr_ag or die "\n\n$usage\nFATAL ERROR: unable to open AG file '$curr_ag'"; $data = (); $data .= $_ while ; close DATA; # Extract AG data get_ag_signals ($aglist_idx, \%signals, $data); print "\t have signals\n" if $opt_v; get_ag_times ($aglist_idx, \%times, $data); print "\t have times\n" if $opt_v; get_ag_annotations ($aglist_idx, \%annotations, $data); print "\t have annotations\n" if $opt_v; normalize_swb1_text (\%annotations) if $opt_c; print "\t text has been normalized\n" if $opt_c and $opt_v; $aglist_idx += 1; } close AGLIST; } else { open DATA, $opt_i or die "\n\nFATAL ERROR: unable to open AG file '$opt_i'$usage"; $data .= $_ while ; close DATA; # Extract AG data get_ag_signals (0, \%signals, $data); print "\t have signals\n" if $opt_v; get_ag_times (0, \%times, $data); print "\t have times\n" if $opt_v; get_ag_annotations (0, \%annotations, $data); normalize_swb1_text (\%annotations) if $opt_c; print "\t text has been normalized\n" if $opt_c and $opt_v; } #format RTTM data create_pseudo_times (\%signals, \%times, \%annotations); print "Have created pseudo-times. Doing ag_to_rttm\n" if $opt_v; check_token_durations(\%times, \%annotations); $rttm = [ag_to_rttm (\%signals, \%times, \%annotations, \%channels)]; print "Have collected rttm data\n" if $opt_v; @$rttm = sort sort_objects @$rttm; print "Have sorted the rttm data\n" if $opt_v; foreach $spkr (keys %speaker_data) { print "\t$spkr\n" if $opt_v; add_IPs ($spkr, $rttm); print "\t\thave IPs\n" if $opt_v; add_SUs ($spkr, $rttm); print "\t\thave SUs\n" if $opt_v; add_speaker_diarization ($spkr, $rttm); print "\t\thave diarization\n" if $opt_v; } @$rttm = sort sort_objects @$rttm; print "Have sorted the rttm data again\n" if $opt_v; foreach my $chnl (keys %channels) { generate_scoring_uems ($rttm, \@spkr_uemsegs, \@disfl_uemsegs, $chnl); } print "Have generated the scoring uem data\n" if $opt_v; #output RTTM data print "Putting out rttm file\n" if $opt_v; output_rttm_object_file ($rttm, $opt_r); print "\tDone putting out rttm file.\n" if $opt_v; #output scoring-UEM files print "Putting out speaker-diarization scoring-uem file\n" if $opt_v; output_speaker_diarization_scoring_uem_file(@spkr_uemsegs); print "Done putting out speaker-diarization scoring-uem file\n" if $opt_v; print "Putting out disfluency-and-su scoring-uem file\n" if $opt_v; output_disfluency_and_su_scoring_uem_file(@disfl_uemsegs); print "Done putting out speaker-diarization scoring-uem file\n" if $opt_v; print "Program exiting normally.\n"; } ################################# sub output_speaker_diarization_scoring_uem_file { open DATA, ">$opt_s" or die "\n\nFATAL ERROR: unable to open scoring UEM output file $opt_s"; foreach my $uemseg (sort sort_uems @_) { print "Spkr seg\n" if $opt_v; print "\tfile $uemseg->{file}\n" if $opt_v; print "\tchnl $uemseg->{chnl}\n" if $opt_v; print "\ttbeg $uemseg->{tbeg}\n" if $opt_v; print "\ttend $uemseg->{tend}\n" if $opt_v; print "\ttdur $uemseg->{tdur}\n" if $opt_v; printf DATA "$uemseg->{file} $uemseg->{chnl} %.3f %.3f\n", $uemseg->{tbeg}, $uemseg->{tend}; } close DATA; } ################################# sub output_disfluency_and_su_scoring_uem_file { open DATA, ">$opt_d" or die "\n\nFATAL ERROR: unable to open scoring UEM output file $opt_d"; foreach my $uemseg (sort sort_uems @_) { print "Disfl seg\n" if $opt_v; print "\tfile $uemseg->{file}\n" if $opt_v; print "\tchnl $uemseg->{chnl}\n" if $opt_v; print "\ttbeg $uemseg->{tbeg}\n" if $opt_v; print "\ttend $uemseg->{tend}\n" if $opt_v; print "\ttdur $uemseg->{tdur}\n" if $opt_v; printf DATA "$uemseg->{file} $uemseg->{chnl} %.3f %.3f\n", $uemseg->{tbeg}, $uemseg->{tend}; } close DATA; } ################################# sub get_ag_signals { my ($id_qual, $signals, $data) = @_; my ($tag, $span, $id, $chnl, $file); while (($span, $data) = extract_sgml_span ("timeline", $data)) { (($tag) = extract_sgml_tag_and_span ("signal", $span)) or die "\n\nFATAL ERROR: no 'signal' in '$span'\n\n"; (($id) = extract_sgml_tag_attribute ("id", $tag)) or die "\n\nFATAL ERROR: no 'id' in '$tag'\n\n"; $id .= ":$id_qual"; (($chnl) = extract_sgml_tag_attribute ("track", $tag)) or die "\n\nFATAL ERROR: no 'track' in '$tag'\n\n"; $chnl = 1 if ($chnl eq "ALL"); # this is a hack for bad data $signals->{$id}{chnl} = $chnl; (($file) = extract_sgml_tag_attribute ("xlink:href", $tag)) or die "\n\nFATAL ERROR: no 'xlink:href' in '$tag'\n\n"; $file =~ s/.*\///; $file =~ s/\..*//; $signals->{$id}{file} = $file; } %$signals or die "\n\nFATAL ERROR: no signals found in get_ag_signals\n\n"; # return {%signals}; } ################################# sub get_ag_times { my ($id_qual, $times, $data) = @_; my ($tag, $span, $id, $time, $signal); while (($tag, $span, $data) = extract_sgml_tag_and_span ("anchor", $data)) { (($id) = extract_sgml_tag_attribute ("id", $tag)) or die "\n\nFATAL ERROR: no 'id' in '$tag'\n\n"; $id .= ":$id_qual"; (($signal) = extract_sgml_tag_attribute ("signals", $tag)) or die "\n\nFATAL ERROR: no 'signals' in '$tag'\n\n"; $signal .= ":$id_qual"; $times->{$id}{signal} = $signal; ($time) = extract_sgml_tag_attribute ("offset", $tag); $times->{$id}{time} = $time if defined $time; } %$times or die "\n\nFATAL ERROR: no times found in get_ag_times\n\n"; #return {%times}; } ################################# sub get_ag_annotations { my ($id_qual, $annotations, $data) = @_; my ($tag, $span, $id, $type, $start, $end, $name, $feature); while (($tag, $span, $data) = extract_sgml_tag_and_span ("annotation", $data)) { (($id) = extract_sgml_tag_attribute ("id", $tag)) or die "\n\nFATAL ERROR: no 'id' in '$tag'\n\n"; $id .= ":$id_qual"; $annotations->{$id}{id_qual} = $id_qual; (($type) = extract_sgml_tag_attribute ("type", $tag)) or die "\n\nFATAL ERROR: no 'type' in '$tag'\n\n"; $annotations->{$id}{type} = $type; (($start) = extract_sgml_tag_attribute ("start", $tag)) or die "\n\nFATAL ERROR: no 'start' in '$tag'\n\n"; $start .= ":$id_qual"; $annotations->{$id}{start} = $start; (($end) = extract_sgml_tag_attribute ("end", $tag)) or die "\n\nFATAL ERROR: no 'end' in '$tag'\n\n"; $end .= ":$id_qual"; $annotations->{$id}{end} = $end; while (($tag, $feature, $span) = extract_sgml_tag_and_span ("feature", $span)) { (($name) = extract_sgml_tag_attribute ("name", $tag)) or die "\n\nFATAL ERROR: no 'name' in '$tag'\n\n"; $annotations->{$id}{features}{$name} = $feature; } } %$annotations or die "\n\nFATAL ERROR: no annotations found in get_ag_annotations\n\n"; #return {%annotations}; } ################################# sub create_pseudo_times { my ($signals, $times, $annotations) = @_; my ($id, $id_qual, $start, $end, $signal, $tbeg, $tend, $time, $dt, $token, $tokens, @tokens); #interpolate times for all tokens in all segments foreach $id (keys %$annotations) { $id_qual = $annotations->{$id}{id_qual}; next unless $annotations->{$id}{type} eq "segment"; $start = $annotations->{$id}{start}; $times->{$start} or die "\n\nFATAL ERROR: no time info for segment start '$start'\n\n"; $signal = $times->{$start}{signal}; $signal .= ":$id_qual"; $end = $annotations->{$id}{end}; $times->{$end} or die "\n\nFATAL ERROR: no time info for segment end '$end'\n\n"; $signal eq ($times->{$end}{signal} . ":$id_qual") or die "\n\nFATAL ERROR: inconsistent start/end signals $signal $times->{$end}{signal} :$id_qual for segment '$id'\n\n"; $tbeg = $times->{$start}{time} or die "\n\nFATAL ERROR: no begin time for segment '$id'\n\n"; $tend = $times->{$end}{time} or die "\n\nFATAL ERROR: no end time for segment '$id'\n\n"; # segments may or may not have children tokens # if (exists($annotations->{$id}{features}{_AtlasAnnChil_})) { $tokens = $annotations->{$id}{features}{_AtlasAnnChil_}; #$tokens =~ /token/ or # die "\n\nFATAL ERROR: children not tokens for segment '$id'\n\n"; $tokens =~ s/.*token *//; @tokens = split /\s+/, $tokens; $time = $tbeg; $dt = ($tend-$tbeg)/@tokens; foreach $token (@tokens) { $token .= ":$id_qual"; $annotations->{$token} or die "\n\nFATAL ERROR: no annotation for token '$token'\n\n"; $start = $annotations->{$token}{start}; $times->{$start} or die "\n\nFATAL ERROR: no time info for token '$token' start '$start'\n\n"; $signal = $times->{$start}{signal}; $end = $annotations->{$token}{end}; $times->{$end} or die "\n\nFATAL ERROR: no time info for token '$token' end '$end'\n\n"; $signal eq $times->{$end}{signal} or die "\n\nFATAL ERROR: inconsistent start/end signals for start '$start' and end '$end'\n\n"; if (not defined $times->{$start}{time}) { $times->{$start}{time} = $time; $times->{$start}{fake} = "true"; } $time += $dt; if (not defined $times->{$end}{time}) { $times->{$end}{time} = $time; $times->{$end}{fake} = "true"; } } } } #check for complete coverage foreach $time (keys %$times) { defined $times->{$time}{time} or die "\n\nFATAL ERROR: time not defined for anchor '$time'\n\n"; } } ################################# sub check_token_durations { my ($times, $annotations) = @_; my ($id, $start, $end, $tbeg, $tend, $fatal); $fatal = 0; foreach $id (keys %$annotations) { next unless $annotations->{$id}{type} eq "token"; $start = $annotations->{$id}{start}; $end = $annotations->{$id}{end}; $times->{$start} or die "\n\nFATAL ERROR: no time info for token start '$start'\n\n"; $times->{$end} or die "\n\nFATAL ERROR: no time info for token end '$end'\n\n"; $tbeg = $times->{$start}{time} or die "\n\nFATAL ERROR: no begin time for token '$id'\n\n"; $tend = $times->{$end}{time} or die "\n\nFATAL ERROR: no end time for token '$id'\n\n"; if ($tend - $tbeg <= 0.0001) { # print all the errors and then die later $annotations->{$id}{features}{text} ? print "FATAL ERROR: token '$annotations->{$id}{features}{text}' at '$tbeg' has zero duration\n" : print "FATAL ERROR: token at '$tbeg' has zero duration\n"; $fatal = 1; } } if ($fatal) { die }; } ################################# sub get_id_qual { my ($id) = @_; my @fields = split(/:/, $id); return @fields[@fields - 1]; } ################################# sub ag_to_rttm { my ($signals, $times, $annotations, $channels) = @_; my ($id, $annotation, $type, $features, $object, $start, $end, $signal, @objects); #add speaker information foreach $id (keys %$annotations) { $annotation = $annotations->{$id}; $type = $annotation->{type}; propagate_speaker_info (get_id_qual($id), $annotations, $annotation) if $type eq "speaker"; } print "Done with speaker info\n" if $opt_v; #flag unclear transcriptions foreach $id (keys %$annotations) { $annotation = $annotations->{$id}; $type = $annotation->{type}; propagate_unclear_info (get_id_qual($id), $annotations, $annotation) if $type =~ /^(unclear|questionableTranscription)$/; } print "Done with unclear transcriptions\n" if $opt_v; #flag filled pauses foreach $id (keys %$annotations) { $annotation = $annotations->{$id}; $type = $annotation->{type}; propagate_fp_info (get_id_qual($id), $annotations, $annotation) if $type eq "filledPause"; } print "Done with filled pauses\n" if $opt_v; #flag no metadata foreach $id (keys %$annotations) { $annotation = $annotations->{$id}; $type = $annotation->{type}; propagate_nometadata_info(get_id_qual($id), $annotations, $annotation) if $type eq "noRTMetadata"; } print "Done with noRTMetadata\n" if $opt_v; #extract RTTM objects foreach $id (keys %$annotations) { $annotation = $annotations->{$id}; $type = $annotation->{type}; $features = $annotation->{features}; $object = ($type eq "speaker" ? extract_spkr_object ($annotation->{features}) : ($type eq "noscore" ? extract_noscore_object ($annotation->{features}) : ($type eq "noRTMetadata" ? extract_no_rt_metadata_object ($annotation->{features}) : ($type eq "segment" ? extract_segment_object ($annotation->{features}) : ($type eq "token" ? extract_stt_object ($annotation->{features}) : ($type =~ /^(SU|depod|filledPause|discourseMarker|explicitEditingTerm|aside)$/ ? $object = extract_mde_object ($type, $annotation->{features}) : undef)))))); next unless $object; $start = $annotations->{$id}{start}; $times->{$start} or die "\n\nFATAL ERROR: no time info for '$start'\n\n"; $signal = $times->{$start}{signal}; $end = $annotations->{$id}{end}; $times->{$end} or die "\n\nFATAL ERROR: no time info for '$end'\n\n"; ($signal eq $times->{$end}{signal}) or die "\n\nFATAL ERROR: inconsistent start/end signals for annotation '$id'\n\n"; $object->{file} = $signals->{$signal}{file}; $object->{chnl} = $signals->{$signal}{chnl}; $object->{tbeg} = $times->{$start}{time}; $object->{tbegValid} = "true" unless $times->{$start}{fake}; $object->{tend} = $times->{$end}{time}; $object->{tendValid} = "true" unless $times->{$end}{fake}; $object->{tdur} = $object->{tend} - $object->{tbeg}; $object->{features} = $annotations->{$id}{features}; if ($type eq "speaker") { my $name = $object->{name}; $speaker_data{$name}{type} = "SPKR-INFO"; $speaker_data{$name}{name} = $name; $speaker_data{$name}{stype} = $object->{stype}; $speaker_data{$name}{file} = $object->{file}; $speaker_data{$name}{chnl} = $object->{chnl}; $channels->{$object->{chnl}} = 1; $speaker_data{$name}{features} = $object->{features}; } else { push @objects, $object; } } return @objects; } ################################# sub propagate_speaker_info { my ($id_qual, $annotations, $parent) = @_; my ($children, $spkr, $child, $annotation); $children = $parent->{features}{_AtlasAnnChil_}; return unless $children; $spkr = $parent->{features}{speakerID}; foreach $child (split /\s+/, $children) { $child .= ":$id_qual"; $annotation = $annotations->{$child}; next unless $annotation; $annotation->{features}{speakerID} = $spkr; propagate_speaker_info ($id_qual, $annotations, $annotation); } } ################################# sub propagate_unclear_info { my ($id_qual, $annotations, $parent) = @_; my ($children, $child, $annotation); $children = $parent->{features}{_AtlasAnnChil_}; return unless $children; foreach $child (split /\s+/, $children) { $child .= ":$id_qual"; $annotation = $annotations->{$child}; next unless $annotation; $annotation->{features}{unclear} = "true"; propagate_unclear_info ($annotations, $annotation); } } ################################# sub propagate_fp_info { my ($id_qual, $annotations, $parent) = @_; my ($children, $child, $annotation); $children = $parent->{features}{_AtlasAnnChil_}; return unless $children; foreach $child (split /\s+/, $children) { $child .= ":$id_qual"; $annotation = $annotations->{$child}; next unless $annotation; next unless $annotation->{type} eq "token"; $annotation->{features}{category} = "filledPause"; propagate_fp_info ($annotations, $annotation); } } ################################# sub propagate_nometadata_info { my ($id_qual, $annotations, $parent) = @_; my ($children, $child, $annotation); $children = $parent->{features}{_AtlasAnnChil_}; return unless $children; foreach $child (split /\s+/, $children) { $annotation = $annotations->{$child}; next unless $annotation; next unless $annotation->{type} eq "token"; $annotation->{features}{nometadata} = "true"; propagate_nometadata_info ($annotations, $annotation); } } ################################# sub extract_spkr_object { my ($features) = @_; my ($name, $subtype, %object); ($name = $features->{speakerID}) or die "\n\nFATAL ERROR: speaker annotation without a defined 'speakerID'\n\n"; #(not $speaker_data{$name}) # or die "\n\nFATAL ERROR: multiple definitions of speakerID '$name'\n\n"; $subtype = lc $features->{type}; $subtype = "unknown" if not $subtype; $subtype = "adult_male" if $subtype eq "male"; $subtype = "adult_female" if $subtype eq "female"; $subtype = "unknown" if $subtype !~ /^(adult_male|adult_female|child)$/; $object{type} = "SPKR-INFO"; $object{stype} = $subtype; $object{name} = $name; return {%object}; } ################################# sub extract_noscore_object { my (%object); $object{type} = "NOSCORE"; $object{id} = ++$noscoreID; return {%object}; } ################################# sub extract_no_rt_metadata_object { my (%object); $object{type} = "NO_RT_METADATA"; $object{id} = ++$noRTMetadataID; return {%object}; } ################################# sub extract_segment_object { my ($features) = @_; my (%object); $object{type} = "SEGMENT"; $object{id} = ++$segID; $object{name} = $features->{speakerID}; return {%object}; } ################################# sub extract_stt_object { my ($features) = @_; my ($text, $category, $subtype, $type, %object); $text = lc $features->{text}; $category = lc $features->{category}; ($type, $subtype) = ((not $category) ? ("LEXEME" , "lex" ) : ($category =~ /(^|;\s*)filledpause/ ? ("LEXEME" , "fp" ) : ($category =~ /(^|;\s*)(prefragment|postfragment)/ ? ("LEXEME" , "frag" ) : ($category =~ /(^|;\s*)(mispronounced|idiosyncratic|laughter)/ ? ("LEXEME" , "un-lex" ) : ($category =~ /(^|;\s*)foreign/ ? ("LEXEME" , "for-lex" ) : ($category =~ /(^|;\s*)spokenletter/ ? ("LEXEME" , "alpha" ) : ($category =~ /(^|;\s*)acronym/ ? ("LEXEME" , "acronym" ) : ($category =~ /(^|;\s*)interjection/ ? ("LEXEME" , "interjection") : ($category =~ /(^|;\s*)propername/ ? ("LEXEME" , "propername" ) : ($category =~ /(^|;\s*)vocalnoise/ ? (($text =~ /^(laughter|laugh|laughing)/ ? ("NON-LEX" , "laugh" ) : ($text =~ /^breath/ ? ("NON-LEX" , "breath" ) : ($text =~ /^lip-?smack/ ? ("NON-LEX" , "lipsmack") : ($text =~ /^(cough|coughs|coughing)/ ? ("NON-LEX" , "cough" ) : ($text =~ /^(sneeze|sneezing)/ ? ("NON-LEX" , "sneeze" ) : ("NON-LEX" , "other"))))))) : ($category =~ /(^|;\s*)nonvocalnoise/ ? ("NON-SPEECH", "noise") : ($category =~ /(^|;\s*)nonvocalnoise/ ? ("NON-SPEECH", "music") : ("NON-LEX" , "other", warn "WARNING: unknown category ($category) for token $text\n"))))))))))))); $subtype = "un-lex" if $subtype eq "lex" and $features->{unclear}; $object{type} = $type; $object{ortho} = $features->{text} if $features->{text}; $object{stype} = $subtype; $object{name} = $features->{speakerID} if $type ne "NON-SPEECH"; $object{id} = (($type eq "LEXEME") ? ++$lexID : (($type eq "NON-LEX") ? ++$nlxID : ++$nspID)); normalize_text(\%object) if $features->{text}; return {%object}; } ################################# sub extract_mde_object { my ($mdetype, $features) = @_; my ($type, $subtype, %object); if ($mdetype eq "SU") { $type = "SUboundary"; $subtype = $features->{type}; ($subtype =~ /^(statement|question|backchannel|incomplete|clausal|coordinating|unannotated)$/) or die "\n\nFATAL ERROR: illegal SU subtype ($subtype)\n\n"; $object{id} = ++$suID; } elsif ($mdetype eq "depod") { $type = "EDIT"; $subtype = $features->{comment}; $subtype = ((not $subtype) ? "simple" : ($subtype =~ /restart/ ? "restart" : ($subtype =~ /repetition/ ? "repetition" : ($subtype =~ /revision/ ? "revision" : ($subtype =~ /complex/ ? "complex" : "simple"))))); $object{id} = ++$edtID; } elsif ($mdetype =~ /filledPause|discourseMarker|explicitEditingTerm/) { $type = "FILLER"; $subtype = ($mdetype =~ /filledPause/ ? "filled_pause" : ($mdetype =~ /discourseMarker/ ? "discourse_marker" : "explicit_editing_term")); $object{id} = ++$flrID; } elsif ($mdetype eq "aside") { $type = "A/P"; $subtype = ""; $object{id} = ++$apID; } $object{type} = $type; $object{stype} = $subtype; $object{name} = $features->{speakerID}; return {%object}; } ################################# sub add_IPs { my ($spkr, $rttm) = @_; my ($object, $ip, $ipp, $prev_edit, @ipdata, @ips); foreach $object (@$rttm) { push @ipdata, $object if $object->{name} and $object->{name} eq $spkr; } @ipdata = sort sort_ips @ipdata; foreach $object (@ipdata) { if ($object->{type} =~ /FILLER|EDIT/) { undef $ip; $ip->{type} = "IP"; $ip->{file} = $object->{file}; $ip->{chnl} = $object->{chnl}; $ip->{name} = $spkr; $ip->{id} = ++$ipID; if ($object->{type} eq "FILLER") { $ipp = $ips[@ips-1] if $prev_edit; # if ( $prev_edit and $object->{tbeg} # < $ipp->{tbeg}+$dt_MaxIPMerge ) { if ($prev_edit) { $ipp->{stype} = "edit&filler"; # $ipp->{tbeg} = ($ipp->{tbeg} + $object->{tbeg})/2; } else { $ip->{stype} = "filler"; $ip->{tbeg} = $ip->{tend} = $object->{tbeg}; $ip->{tbegValid} = $ip->{tendValid} = $object->{tbegValid}; push @ips, $ip; } } else { $ip->{stype} = "edit"; $ip->{tbeg} = $ip->{tend} = $object->{tend}; $ip->{tbegValid} = $ip->{tendValid} = $object->{tendValid}; push @ips, $ip; $prev_edit = 1; } } # elsif ($object->{type} =~ /LEXEME|NON-LEX/) { elsif ($object->{type} =~ /LEXEME|SUboundary/) { undef $prev_edit; } } push @$rttm, @ips if @ips; } ################################# sub sort_ips { return (ip_time($a) < ip_time($b)-0.0001 ? -1 : (ip_time($a) > ip_time($b)+0.0001 ? 1 : $sort_order{$a->{type}} <=> $sort_order{$b->{type}})); } ################################# sub ip_time { my ($object) = @_; return $object->{type} =~ /EDIT/ ? $object->{tend} : $object->{tbeg}; } ################################# sub add_SUs { my ($spkr, $rttm) = @_; my ($object, $su, $cb, @sudata, @sus); foreach $object (@$rttm) { next unless $object->{name}; next if $object->{name} ne $spkr; next unless $object->{type} =~/^(SUboundary|LEXEME)$/; push @sudata, $object; } @sudata = sort sort_sus @sudata; foreach $object (@sudata) { if ($object->{type} eq "LEXEME") { if ($su) { $su->{tend} = $object->{tend}; $su->{tendValid} = $object->{tendValid}; $su->{tdur} = $su->{tend} - $su->{tbeg}; } else { $su->{type} = "SU"; $su->{file} = $object->{file}; $su->{chnl} = $object->{chnl}; $su->{tbeg} = $object->{tbeg}; $su->{tbegValid} = $object->{tbegValid}; $su->{tend} = $object->{tend}; $su->{tendValid} = $object->{tendValid}; $su->{tdur} = $su->{tend} - $su->{tbeg}; $su->{name} = $spkr; $su->{id} = ++$suID; } } elsif ($object->{stype} =~ /coordinating|clausal/) { undef $cb; $cb->{type} = "CB"; $cb->{file} = $object->{file}; $cb->{chnl} = $object->{chnl}; $cb->{tbeg} = $cb->{tend} = $object->{tend}; $cb->{tbegValid} = $cb->{tendValid} = $object->{tendValid}; $cb->{stype} = $object->{stype}; $cb->{name} = $spkr; $cb->{id} = ++$suID; push @sus, $cb; $su or warn "WARNING: clausal boundary at '$cb->{tbeg}' is not inside an SU\n"; } elsif ($su) { $su->{stype} = $object->{stype}; push @sus, $su; undef $su; } else { warn "WARNING: SU from object of type $object->{type} ending at '$object->{tend}' for spkr '$spkr' contains no lexemes and will be discarded\n"; } } (not $su) or warn "WARNING: dangling LEXEMEs at end of file with no enclosing SU\n"; push @$rttm, @sus if @sus; } ################################# sub sort_sus { return 1 if $a->{type} eq "SUboundary" and $a->{tend} > $b->{tend}-0.0001; return -1 if $b->{type} eq "SUboundary" and $b->{tend} > $a->{tend}-0.0001; return 1 if $a->{tbeg} > $b->{tend}-0.0001; return -1 if $b->{tbeg} > $a->{tend}-0.0001; return 0; } ################################# sub sort_uems { return 1 if $a->{tbeg} > $b->{tbeg}; return -1 if $b->{tbeg} > $a->{tbeg}; return 0; } ################################# sub generate_scoring_uems { my ($rttm, $spkr_uemsegs, $disfl_uemsegs, $channel) = @_; my ($object, $spkr_uemseg, $disfl_uemseg, $tendPrevLex); my ($tendCurrSpkrSeg, $tendCurrDisflSeg); my ($scoringSuppressed, $suppressAtLeastUntil); # due to NON-LEX $tendPrevLex = 0.0; # default value $tendCurrSpkrSeg = 0.0; # default value $tendCurrDisflSeg = 0.0; # default value $scoringSuppressed = 0; # i.e., false $suppressAtLeastUntil = 0.0; # default value undef $begNoscore; undef $endNoscore; foreach $object (@$rttm) { next unless $channel == $object->{chnl}; # Generate uem segments if ( $object->{type} eq "NOSCORE" or $object->{type} eq "NO_RT_METADATA" or ($object->{type} eq "SU" and $object->{stype} eq "unannotated") or ($object->{type} eq "LEXEME" and $object->{stype} eq "un-lex") or ( $object->{type} eq "SEGMENT" and $object->{name} eq "unknown" and $speaker_data{$object->{name}}{stype} eq "unknown" ) ) { if (defined $endNoscore) { if ($object->{tbeg} > $endNoscore) { $begNoscore = $object->{tbeg}; } } else { $begNoscore = $object->{tbeg}; } if (defined $endNoscore) { if ($endNoscore < $object->{tend}) { $endNoscore = $object->{tend}; } } else { $endNoscore = $object->{tend}; } if ($spkr_uemseg) { if ( $spkr_uemseg->{tbeg} < $begNoscore and $spkr_uemseg->{tend} > $begNoscore and $spkr_uemseg->{tend} <= $endNoscore ) { $spkr_uemseg->{tend} = $begNoscore; if (defined $spkr_uemseg->{tbeg}) { push @$spkr_uemsegs, $spkr_uemseg; } undef $spkr_uemseg; } elsif ( $spkr_uemseg->{tend} > $endNoscore and $spkr_uemseg->{tbeg} > $begNoscore and $spkr_uemseg->{tbeg} <= $endNoscore ) { $spkr_uemseg->{tbeg} = $endNoscore; } elsif ( $spkr_uemseg->{tbeg} < $begNoscore and $spkr_uemseg->{tend} > $endNoscore ) { my $savdEnd = $spkr_uemseg->{tend}; my $savdFile = $spkr_uemseg->{file}; my $savdChnl = $spkr_uemseg->{chnl}; $spkr_uemseg->{tend} = $begNoscore; if (defined $spkr_uemseg->{tbeg}) { push @$spkr_uemsegs, $spkr_uemseg; } undef $spkr_uemseg; $spkr_uemseg->{file} = $savdFile; $spkr_uemseg->{chnl} = $savdChnl; $spkr_uemseg->{tbeg} = $endNoscore; $spkr_uemseg->{tend} = $savdEnd; $spkr_uemseg->{tdur} = $spkr_uemseg->{tend} - $spkr_uemseg->{tbeg}; } elsif ($spkr_uemseg->{tend} <= $begNoscore) { if (defined $spkr_uemseg->{tbeg}) { push @$spkr_uemsegs, $spkr_uemseg; } undef $spkr_uemseg; } } if ($disfl_uemseg) { if ( $disfl_uemseg->{tbeg} < $begNoscore and $disfl_uemseg->{tend} > $begNoscore and $disfl_uemseg->{tend} <= $endNoscore ) { $disfl_uemseg->{tend} = $begNoscore; push @$disfl_uemsegs, $disfl_uemseg; undef $disfl_uemseg; } elsif ( $disfl_uemseg->{tend} > $endNoscore and $disfl_uemseg->{tbeg} > $begNoscore and $disfl_uemseg->{tbeg} <= $endNoscore ) { $disfl_uemseg->{tbeg} = $endNoscore; } elsif ( $disfl_uemseg->{tbeg} < $begNoscore and $disfl_uemseg->{tend} > $endNoscore ) { my $savdEnd = $disfl_uemseg->{tend}; my $savdFile = $disfl_uemseg->{file}; my $savdChnl = $disfl_uemseg->{chnl}; $disfl_uemseg->{tend} = $begNoscore; if (defined $disfl_uemseg->{tbeg}) { push @$disfl_uemsegs, $disfl_uemseg; } undef $disfl_uemseg; $disfl_uemseg->{file} = $savdFile; $disfl_uemseg->{chnl} = $savdChnl; $disfl_uemseg->{tbeg} = $endNoscore; $disfl_uemseg->{tend} = $savdEnd; $disfl_uemseg->{tdur} = $disfl_uemseg->{tend} - $disfl_uemseg->{tbeg}; } elsif ($disfl_uemseg->{tend} <= $begNoscore) { if (defined $disfl_uemseg->{tbeg}) { push @$disfl_uemsegs, $disfl_uemseg; } undef $disfl_uemseg; } } } elsif ($object->{type} =~ /SEGMENT|LEXEME|NON-LEX|NON-SPEECH/) { if ($spkr_uemseg) { if ($object->{type} =~ /NON-LEX|NON-SPEECH/) { if (not defined $spkr_uemseg->{tbeg}) { $suppressAtLeastUntil = $object->{tend}; } else { if ($tendPrevLex > $spkr_uemseg->{tbeg}) { $spkr_uemseg->{tend} = $tendPrevLex; $spkr_uemseg->{tdur} = $tendPrevLex - $spkr_uemseg->{tbeg}; } if ($spkr_uemseg->{tbeg} <= $object->{tbeg}) { if ($spkr_uemseg->{tbeg} < $object->{tbeg}) { if ($spkr_uemseg->{tend} > $object->{tbeg}) { if ($spkr_uemseg->{tend} > $object->{tend}) { my $savdFile = $spkr_uemseg->{file}; my $savdChnl = $spkr_uemseg->{chnl}; my $savdTend = $spkr_uemseg->{tend}; $spkr_uemseg->{tend} = $object->{tbeg}; push @$spkr_uemsegs, $spkr_uemseg; undef $spkr_uemseg; $spkr_uemseg->{file} = $savdFile; $spkr_uemseg->{chnl} = $savdChnl; $spkr_uemseg->{tbeg} = $object->{tend}; $spkr_uemseg->{tend} = $savdTend; } else { $spkr_uemseg->{tend} = $object->{tbeg}; push @$spkr_uemsegs, $spkr_uemseg; undef $spkr_uemseg; } } else { push @$spkr_uemsegs, $spkr_uemseg; undef $spkr_uemseg; } } elsif ($spkr_uemseg->{tend} > $object->{tend}) { $spkr_uemseg->{tbeg} = $object->{tend}; } else { undef $spkr_uemseg; } $scoringSuppressed = 1; # i.e., true $suppressAtLeastUntil = $object->{tend}; } } } elsif ($object->{type} eq "SEGMENT") { ## Note that this is a SEGMENT that did not match ## the initial "if" -- see the NOSCORE and NO_RTMETADATA ## logic above if ($tendCurrSpkrSeg > $object->{tbeg}) { # we have an overlap # Note that "overlap" is defined by the SEGMENTs if ($spkr_uemseg->{tbeg} < $object->{tbeg}) { if ($opt_x) { # exclude overlap region from the uem file my $endOverlap = $spkr_uemseg->{tend}; my $savdFile = $spkr_uemseg->{file}; my $savdChnl = $spkr_uemseg->{chnl}; $spkr_uemseg->{tend} = $object->{tbeg}; #start of overlap $spkr_uemseg->{tdur} = $spkr_uemseg->{tend} - $spkr_uemseg->{tbeg}; push @$spkr_uemsegs, $spkr_uemseg; undef $spkr_uemseg; if (defined $endNoscore) { if ($object->{tend} > $endNoscore) { $spkr_uemseg->{file} = $savdFile; $spkr_uemseg->{chnl} = $savdChnl; $spkr_uemseg->{tbeg} = $endOverlap > $endNoscore ? $endOverlap : $endNoscore; $spkr_uemseg->{tend} = $object->{tend}; } #else the segment ends within noscore region # ... and we leave spkr_uemseg undef'd } else { $spkr_uemseg->{file} = $savdFile; $spkr_uemseg->{chnl} = $savdChnl; $spkr_uemseg->{tbeg} = $endOverlap; $spkr_uemseg->{tend} = $object->{tend}; $spkr_uemseg->{tdur} = $object->{tend} - $endOverlap; } } else { # keep overlap region in the uem file if (defined $begNoscore and defined $endNoscore) { if ($object->{tend} > $endNoscore) { $spkr_uemseg->{file} = $object->{file}; $spkr_uemseg->{chnl} = $object->{chnl}; if ($begNoscore < $object->{tbeg}) { if ($object->{tbeg} <= $endNoscore) { $spkr_uemseg->{tbeg} = $endNoscore; } } if (not defined $spkr_uemseg->{tbeg}) { $spkr_uemseg->{tbeg} = $object->{tbeg}; } $spkr_uemseg->{tend} = $object->{tend}; $spkr_uemseg->{tdur} = $spkr_uemseg->{tend} - $spkr_uemseg->{tbeg}; } elsif ($object->{tbeg} < $begNoscore) { if (not defined $spkr_uemseg->{tbeg}) { $spkr_uemseg->{tbeg} = $object->{tbeg} } $spkr_uemseg->{tend} = $begNoscore; $spkr_uemseg->{tdur} = $spkr_uemseg->{tend} - $spkr_uemseg->{tbeg}; } else { undef $spkr_uemseg; # all no-scored next; } } else { if (not defined $spkr_uemseg->{tbeg}) { $spkr_uemseg->{tbeg} = $object->{tbeg} } if ($object->{tend} > $tendCurrSpkrSeg) { $tendCurrSpkrSeg = $object->{tend}; } $spkr_uemseg->{tend} = $tendCurrSpkrSeg; $spkr_uemseg->{tdur} = $spkr_uemseg->{tend} - $spkr_uemseg->{tbeg}; } } } } else { # not an overlap situation if (defined $begNoscore and defined $endNoscore) { if ($object->{tend} > $endNoscore) { $spkr_uemseg->{file} = $object->{file}; $spkr_uemseg->{chnl} = $object->{chnl}; if ($begNoscore < $object->{tbeg}) { if ($object->{tbeg} <= $endNoscore) { $spkr_uemseg->{tbeg} = $endNoscore; } } if (not defined $spkr_uemseg->{tbeg}) { $spkr_uemseg->{tbeg} = $object->{tbeg}; } $spkr_uemseg->{tend} = $object->{tend}; $spkr_uemseg->{tdur} = $spkr_uemseg->{tend} - $spkr_uemseg->{tbeg}; } elsif ($object->{tbeg} < $begNoscore) { if (not defined $spkr_uemseg->{tbeg}) { $spkr_uemseg->{tbeg} = $object->{tbeg} } $spkr_uemseg->{tend} = $begNoscore; $spkr_uemseg->{tdur} = $spkr_uemseg->{tend} - $spkr_uemseg->{tbeg}; } else { undef $spkr_uemseg; # all no-scored next; } } else { if (not defined $spkr_uemseg->{tbeg}) { $spkr_uemseg->{tbeg} = $object->{tbeg} } if ($object->{tend} > $tendCurrSpkrSeg) { $tendCurrSpkrSeg = $object->{tend}; } $spkr_uemseg->{tend} = $tendCurrSpkrSeg; $spkr_uemseg->{tdur} = $spkr_uemseg->{tend} - $spkr_uemseg->{tbeg}; } } if ($tendCurrSpkrSeg < $object->{tend}) { $tendCurrSpkrSeg = $object->{tend}; } if (defined $begNoscore and defined $endNoscore) { if ( $spkr_uemseg->{tbeg} < $begNoscore and $spkr_uemseg->{tend} > $begNoscore and $spkr_uemseg->{tend} <= $endNoscore ) { $spkr_uemseg->{tend} = $begNoscore; push @$spkr_uemsegs, $spkr_uemseg; undef $spkr_uemseg; } elsif ( $spkr_uemseg->{tend} > $endNoscore and $spkr_uemseg->{tbeg} > $begNoscore and $spkr_uemseg->{tbeg} < $endNoscore ) { $spkr_uemseg->{tbeg} = $endNoscore; } elsif ( $spkr_uemseg->{tbeg} < $begNoscore and $spkr_uemseg->{tend} > $endNoscore ) { my $savdEnd = $spkr_uemseg->{tend}; my $savdFile = $spkr_uemseg->{file}; my $savdChnl = $spkr_uemseg->{chnl}; $spkr_uemseg->{tend} = $begNoscore; push @$spkr_uemsegs, $spkr_uemseg; undef $spkr_uemseg; $spkr_uemseg->{file} = $savdFile; $spkr_uemseg->{chnl} = $savdChnl; $spkr_uemseg->{tbeg} = $endNoscore; $spkr_uemseg->{tend} = $savdEnd; $spkr_uemseg->{tdur} = $spkr_uemseg->{tend} - $spkr_uemseg->{tbeg}; } elsif ( $spkr_uemseg->{tbeg} >= $begNoscore and $spkr_uemseg->{tend} <= $endNoscore ) { undef $spkr_uemseg; } } if ($scoringSuppressed == 1) { if ((not defined $begNoscore and not defined $endNoscore) or $object->{tbeg} >= $endNoscore or $object->{tend} <= $begNoscore) { if ($object->{tend} > $suppressAtLeastUntil) { if ($object->{tbeg} >= $suppressAtLeastUntil) { $spkr_uemseg->{tbeg} = $object->{tbeg}; } else { $spkr_uemseg->{tbeg} = $suppressAtLeastUntil; } $scoringSuppressed = 0; # i.e., false } } } } if ($object->{type} =~ /LEXEME/) { $tendPrevLex = $object->{tend}; if ($scoringSuppressed == 1) { if ((not defined $begNoscore and not defined $endNoscore) or $object->{tbeg} >= $endNoscore or $object->{tend} <= $begNoscore) { if ($object->{tend} > $suppressAtLeastUntil) { if ($object->{tbeg} >= $suppressAtLeastUntil) { $spkr_uemseg->{tbeg} = $object->{tbeg}; } else { $spkr_uemseg->{tbeg} = $suppressAtLeastUntil; } $scoringSuppressed = 0; # i.e., false } } } } } if ($disfl_uemseg) { if ($object->{type} eq "SEGMENT") { if ($tendCurrDisflSeg > $object->{tbeg}) { # we have an overlap # Note that "overlap" is defined by the SEGMENTs if ($disfl_uemseg->{tbeg} < $object->{tbeg}) { if ($opt_x) { # exclude overlap region from the uem file my $endOverlap = $disfl_uemseg->{tend}; my $savdFile = $disfl_uemseg->{file}; my $savdChnl = $disfl_uemseg->{chnl}; $disfl_uemseg->{tend} = $object->{tbeg}; #start of overlap $disfl_uemseg->{tdur} = $disfl_uemseg->{tend} - $disfl_uemseg->{tbeg}; push @$disfl_uemsegs, $disfl_uemseg; undef $disfl_uemseg; $disfl_uemseg->{file} = $savdFile; $disfl_uemseg->{chnl} = $savdChnl; $disfl_uemseg->{tbeg} = $endOverlap; $disfl_uemseg->{tend} = $object->{tend}; $disfl_uemseg->{tdur} = $object->{tend} - $endOverlap; } else { # keep overlap region in the uem file if (defined $begNoscore and defined $endNoscore) { if ($object->{tend} > $endNoscore) { $disfl_uemseg->{file} = $object->{file}; $disfl_uemseg->{chnl} = $object->{chnl}; if ($begNoscore < $object->{tbeg}) { if ($object->{tbeg} <= $endNoscore) { $disfl_uemseg->{tbeg} = $endNoscore; } } if (not defined $disfl_uemseg->{tbeg}) { $disfl_uemseg->{tbeg} = $object->{tbeg}; } $disfl_uemseg->{tend} = $object->{tend}; $disfl_uemseg->{tdur} = $disfl_uemseg->{tend} - $disfl_uemseg->{tbeg}; } elsif ($object->{tbeg} < $begNoscore) { if (not defined $disfl_uemseg->{tbeg}) { $disfl_uemseg->{tbeg} = $object->{tbeg} } $disfl_uemseg->{tend} = $begNoscore; $disfl_uemseg->{tdur} = $disfl_uemseg->{tend} - $spkr_uemseg->{tbeg}; } else { undef $disfl_uemseg; # all no-scored next; } } else { if (not defined $disfl_uemseg->{tbeg}) { $disfl_uemseg->{tbeg} = $object->{tbeg} } if ($object->{tend} > $tendCurrDisflSeg) { $tendCurrDisflSeg = $object->{tend}; } $disfl_uemseg->{tend} = $tendCurrDisflSeg; $disfl_uemseg->{tdur} = $disfl_uemseg->{tend} - $disfl_uemseg->{tbeg}; } } } } else { # not an overlap situation if (defined $begNoscore and defined $endNoscore) { if ($object->{tend} > $endNoscore) { $disfl_uemseg->{file} = $object->{file}; $disfl_uemseg->{chnl} = $object->{chnl}; if ($begNoscore < $object->{tbeg}) { if ($object->{tbeg} <= $endNoscore) { $disfl_uemseg->{tbeg} = $endNoscore; } } if (not defined $disfl_uemseg->{tbeg}) { $disfl_uemseg->{tbeg} = $object->{tbeg}; } $disfl_uemseg->{tend} = $object->{tend}; $disfl_uemseg->{tdur} = $disfl_uemseg->{tend} - $disfl_uemseg->{tbeg}; } elsif ($object->{tbeg} < $begNoscore) { if (not defined $disfl_uemseg->{tbeg}) { $disfl_uemseg->{tbeg} = $object->{tbeg} } $disfl_uemseg->{tend} = $begNoscore; $disfl_uemseg->{tdur} = $disfl_uemseg->{tend} - $spkr_uemseg->{tbeg}; } else { undef $disfl_uemseg; # all no-scored next; } } else { if (not defined $disfl_uemseg->{tbeg}) { $disfl_uemseg->{tbeg} = $object->{tbeg} } if ($object->{tend} > $tendCurrDisflSeg) { $tendCurrDisflSeg = $object->{tend}; } $disfl_uemseg->{tend} = $tendCurrDisflSeg; $disfl_uemseg->{tdur} = $disfl_uemseg->{tend} - $disfl_uemseg->{tbeg}; } } if ($tendCurrDisflSeg < $object->{tend}) { $tendCurrDisflSeg = $object->{tend}; } if (defined $begNoscore and defined $endNoscore) { if ( $disfl_uemseg->{tbeg} < $begNoscore and $disfl_uemseg->{tend} > $begNoscore and $disfl_uemseg->{tend} <= $endNoscore ) { $disfl_uemseg->{tend} = $begNoscore; push @$disfl_uemsegs, $disfl_uemseg; undef $disfl_uemseg; } elsif ( $disfl_uemseg->{tend} > $endNoscore and $disfl_uemseg->{tbeg} > $begNoscore and $disfl_uemseg->{tbeg} < $endNoscore ) { $disfl_uemseg->{tbeg} = $endNoscore; } elsif ( $disfl_uemseg->{tbeg} < $begNoscore and $disfl_uemseg->{tend} > $endNoscore ) { my $savdEnd = $disfl_uemseg->{tend}; my $savdFile = $disfl_uemseg->{file}; my $savdChnl = $disfl_uemseg->{chnl}; $disfl_uemseg->{tend} = $begNoscore; push @$disfl_uemsegs, $disfl_uemseg; undef $disfl_uemseg; $disfl_uemseg->{file} = $savdFile; $disfl_uemseg->{chnl} = $savdChnl; $disfl_uemseg->{tbeg} = $endNoscore; $disfl_uemseg->{tend} = $savdEnd; $disfl_uemseg->{tdur} = $disfl_uemseg->{tend} - $disfl_uemseg->{tbeg}; } elsif ( $disfl_uemseg->{tbeg} >= $begNoscore and $disfl_uemseg->{tend} <= $endNoscore ) { undef $disfl_uemseg; } } } } if (not $disfl_uemseg and $object->{type} =~ /SEGMENT/) { if (not (defined $begNoscore and defined $endNoscore and $object->{tbeg} >= $begNoscore and $object->{tend} <= $endNoscore)) { $disfl_uemseg->{file} = $object->{file}; $disfl_uemseg->{chnl} = $object->{chnl}; if ($tendCurrDisflSeg < $object->{tend}) { $tendCurrDisflSeg = $object->{tend}; } if (defined $begNoscore and defined $endNoscore) { if ($object->{tbeg} < $begNoscore and $object->{tend} > $begNoscore) { $disfl_uemseg->{tend} = $begNoscore; } else { $disfl_uemseg->{tend} = $object->{tend}; } if ($object->{tend} > $endNoscore and $object->{tbeg} < $endNoscore) { $disfl_uemseg->{tbeg} = $endNoscore; } else { $disfl_uemseg->{tbeg} = $object->{tbeg}; } } else { $disfl_uemseg->{tbeg} = $object->{tbeg}; $disfl_uemseg->{tend} = $object->{tend}; } $disfl_uemseg->{tdur} = $disfl_uemseg->{tend} - $disfl_uemseg->{tbeg}; if (defined $begNoscore and defined $endNoscore) { if ( $disfl_uemseg->{tbeg} < $begNoscore and $disfl_uemseg->{tend} > $begNoscore and $disfl_uemseg->{tend} <= $endNoscore ) { $disfl_uemseg->{tend} = $begNoscore; push @$disfl_uemsegs, $disfl_uemseg; undef $disfl_uemseg; } elsif ( $disfl_uemseg->{tend} > $endNoscore and $disfl_uemseg->{tbeg} > $begNoscore and $disfl_uemseg->{tbeg} < $endNoscore ) { $disfl_uemseg->{tbeg} = $endNoscore; } elsif ( $disfl_uemseg->{tbeg} < $begNoscore and $disfl_uemseg->{tend} > $endNoscore ) { my $savdEnd = $disfl_uemseg->{tend}; my $savdFile = $disfl_uemseg->{file}; my $savdChnl = $disfl_uemseg->{chnl}; $disfl_uemseg->{tend} = $begNoscore; push @$disfl_uemsegs, $disfl_uemseg; undef $disfl_uemseg; $disfl_uemseg->{file} = $savdFile; $disfl_uemseg->{chnl} = $savdChnl; $disfl_uemseg->{tbeg} = $endNoscore; $disfl_uemseg->{tend} = $savdEnd; $disfl_uemseg->{tdur} = $disfl_uemseg->{tend} - $disfl_uemseg->{tbeg}; } elsif ( $disfl_uemseg->{tbeg} >= $begNoscore and $disfl_uemseg->{tend} <= $endNoscore ) { undef $disfl_uemseg; } } } } ## END if not $disfl_uemseg if (not $spkr_uemseg and ($object->{type} =~ /SEGMENT|LEXEME/)) { $spkr_uemseg->{file} = $object->{file}; $spkr_uemseg->{chnl} = $object->{chnl}; if ($object->{type} eq "SEGMENT") { if (not (defined $begNoscore and defined $endNoscore and $object->{tbeg} >= $begNoscore and $object->{tend} <= $endNoscore)) { if ($tendCurrSpkrSeg < $object->{tend}) { $tendCurrSpkrSeg = $object->{tend}; } if (defined $begNoscore and defined $endNoscore) { if ($object->{tbeg} < $begNoscore and $object->{tend} > $begNoscore) { $spkr_uemseg->{tend} = $begNoscore; } else { $spkr_uemseg->{tend} = $object->{tend}; } if ($object->{tend} > $endNoscore and $object->{tbeg} < $endNoscore) { $spkr_uemseg->{tbeg} = $endNoscore; } else { $spkr_uemseg->{tbeg} = $object->{tbeg}; } } else { $spkr_uemseg->{tbeg} = $object->{tbeg}; $spkr_uemseg->{tend} = $object->{tend}; } $spkr_uemseg->{tdur} = $spkr_uemseg->{tend} - $spkr_uemseg->{tbeg}; if ($scoringSuppressed == 0) { if (defined $begNoscore and defined $endNoscore) { if ( $spkr_uemseg->{tbeg} < $begNoscore and $spkr_uemseg->{tend} > $begNoscore and $spkr_uemseg->{tend} <= $endNoscore ) { $spkr_uemseg->{tend} = $begNoscore; push @$spkr_uemsegs, $spkr_uemseg; undef $spkr_uemseg; } elsif ( $spkr_uemseg->{tend} > $endNoscore and $spkr_uemseg->{tbeg} > $begNoscore and $spkr_uemseg->{tbeg} < $endNoscore ) { $spkr_uemseg->{tbeg} = $endNoscore; } elsif ( $spkr_uemseg->{tbeg} < $begNoscore and $spkr_uemseg->{tend} > $endNoscore ) { my $savdEnd = $spkr_uemseg->{tend}; my $savdFile = $spkr_uemseg->{file}; my $savdChnl = $spkr_uemseg->{chnl}; $spkr_uemseg->{tend} = $begNoscore; push @$spkr_uemsegs, $spkr_uemseg; undef $spkr_uemseg; $spkr_uemseg->{file} = $savdFile; $spkr_uemseg->{chnl} = $savdChnl; $spkr_uemseg->{tbeg} = $endNoscore; $spkr_uemseg->{tend} = $savdEnd; $spkr_uemseg->{tdur} = $spkr_uemseg->{tend} - $spkr_uemseg->{tbeg}; } elsif ( $spkr_uemseg->{tbeg} >= $begNoscore and $spkr_uemseg->{tend} <= $endNoscore ) { undef $spkr_uemseg; } } } #else, since scoringSuppressed, # $spkr_uemseg->{tbeg} to be filled in when # see first LEXEME token in segment } else { undef $spkr_uemseg; } } elsif ($object->{type} =~ /LEXEME/ and $object->{tend} <= $tendCurrSpkrSeg) { if ($object->{tend} > $suppressAtLeastUntil ) { $tendPrevLex = $object->{tend}; if (defined $begNoscore and defined $endNoscore) { if ($object->{tbeg} < $begNoscore and $tendCurrSpkrSeg > $begNoscore) { $spkr_uemseg->{tend} = $begNoscore; } else { $spkr_uemseg->{tend} = $tendCurrSpkrSeg; } } else { $spkr_uemseg->{tend} = $tendCurrSpkrSeg; } if ($scoringSuppressed == 0) { if (defined $begNoscore and defined $endNoscore) { if ($object->{tend} > $endNoscore and $object->{tbeg} < $endNoscore) { $spkr_uemseg->{tbeg} = $endNoscore; } else { $spkr_uemseg->{tbeg} = $object->{tbeg}; } } else { $spkr_uemseg->{tbeg} = $object->{tbeg}; } $spkr_uemseg->{tdur} = $spkr_uemseg->{tend} - $spkr_uemseg->{tbeg}; } else { # scoringSuppressed == 1 if ($object->{tbeg} >= $suppressAtLeastUntil) { $spkr_uemseg->{tbeg} = $object->{tbeg}; } else { $spkr_uemseg->{tbeg} = $suppressAtLeastUntil; } $scoringSuppressed = 0; # i.e., false if (defined $begNoscore and defined $endNoscore) { if ( $spkr_uemseg->{tbeg} < $begNoscore and $spkr_uemseg->{tend} > $begNoscore and $spkr_uemseg->{tend} <= $endNoscore ) { $spkr_uemseg->{tend} = $begNoscore; push @$spkr_uemsegs, $spkr_uemseg; undef $spkr_uemseg; } elsif ( $spkr_uemseg->{tend} > $endNoscore and $spkr_uemseg->{tbeg} > $begNoscore and $spkr_uemseg->{tbeg} < $endNoscore ) { $spkr_uemseg->{tbeg} = $endNoscore; } elsif ($spkr_uemseg->{tbeg} < $begNoscore and $spkr_uemseg->{tend} > $endNoscore ) { my $savdEnd = $spkr_uemseg->{tend}; my $savdFile = $spkr_uemseg->{file}; my $savdChnl = $spkr_uemseg->{chnl}; $spkr_uemseg->{tend} = $begNoscore; push @$spkr_uemsegs, $spkr_uemseg; undef $spkr_uemseg; $spkr_uemseg->{file} = $savdFile; $spkr_uemseg->{chnl} = $savdChnl; $spkr_uemseg->{tbeg} = $endNoscore; $spkr_uemseg->{tend} = $savdEnd; $spkr_uemseg->{tdur} = $spkr_uemseg->{tend} - $spkr_uemseg->{tbeg}; } elsif ( $spkr_uemseg->{tbeg} >= $begNoscore and $spkr_uemseg->{tend} <= $endNoscore ) { undef $spkr_uemseg; } } } } elsif ($object->{type} eq "LEXEME" and $object->{tend} <= $suppressAtLeastUntil) { undef $spkr_uemseg; } } else { undef $spkr_uemseg; } } ## END if not $spkr_uemseg if ( $tendPrevLex == 0.0 and $object->{type} =~ /LEXEME/ and ( (defined $begNoscore and $object->{tbeg} < $begNoscore) or (defined $endNoscore and $object->{tend} > $endNoscore))) { ## this code is not expected to be executed print "**** Error **** tendPrevLex==0.0 at end of generate_spkr_scoring_uem()\n"; $tendPrevLex = $object->{tend}; } } } push @$spkr_uemsegs, $spkr_uemseg if $spkr_uemseg; push @$disfl_uemsegs, $disfl_uemseg if $disfl_uemseg; } ################################# sub add_speaker_diarization { my ($spkr, $rttm) = @_; my ($object, $spkrseg, @spkrsegs); foreach $object (@$rttm) { ## Do spkr segs next unless $object->{name}; next if ($object->{name} ne $spkr); next unless $object->{type} =~ /LEXEME|NON-LEX/; if ($spkrseg) { if ($object->{tbeg} > $spkrseg->{tend}+$dt_MaxSpkrMerge) { push @spkrsegs, $spkrseg; undef $spkrseg; } else { $spkrseg->{tend} = $object->{tend}; $spkrseg->{tendValid} = $object->{tendValid}; $spkrseg->{tdur} = $spkrseg->{tend} - $spkrseg->{tbeg}; } } if (not $spkrseg) { $spkrseg->{type} = "SPEAKER"; $spkrseg->{id} = ++$spkrID; $spkrseg->{file} = $object->{file}; $spkrseg->{chnl} = $object->{chnl}; $spkrseg->{tbeg} = $object->{tbeg}; $spkrseg->{tbegValid} = $object->{tbegValid}; $spkrseg->{tend} = $object->{tend}; $spkrseg->{tendValid} = $object->{tendValid}; $spkrseg->{tdur} = $object->{tend} - $object->{tbeg}; $spkrseg->{name} = $spkr; } } push @spkrsegs, $spkrseg if $spkrseg; push @$rttm, @spkrsegs if @spkrsegs; } ################################# sub output_rttm_object_file { my ($rttm, $file) = @_; my ($object, $name); open DATA, ">$file" or die "\n\nFATAL ERROR: unable to open RTTM object output file '$file'$usage"; print DATA ";;This is an RTTM file. Each record contains 9 whitespace separated fields:\n"; print DATA ";; 1:type 2:file 3:chnl 4:tbeg 5:tdur 6:ortho 7:subtype 8:spkrname 9:conf\n"; foreach $name (sort keys %speaker_data) { print_object ($speaker_data{$name}); } @$rttm = sort sort_objects @$rttm; foreach $object (@$rttm) { print_object($object) if $object->{type} =~ /^(NO_RT_METADATA|SEGMENT|LEXEME|NON-LEX|NON-SPEECH|FILLER|EDIT|IP|SU|CB|A\/P|SPEAKER)$/; print_object($object) if ($opt_n and $object->{type} eq "NOSCORE"); } close DATA; } ################################# sub print_object { my ($object) = @_; printf DATA "%-12s %-12s %3s", $object->{type}, $object->{file}, $object->{chnl}; defined $object->{tbeg} ? (printf DATA " %8.3f%s", $object->{tbeg} , ($object->{tbegValid} ? " ":"*")) : (printf DATA " %9s", " "); defined $object->{tdur} ? (printf DATA " %6.3f%s", $object->{tdur} , ($object->{tendValid} ? " ":"*")) : (printf DATA " %7s", " "); printf DATA " %-16s %-20s %-20s %s\n", ($object->{ortho} ? $object->{ortho} : ""), ($object->{stype} ? $object->{stype} : ""), ($object->{name} ? $object->{name} : ""), ($object->{features}{conf} ? $object->{features}{conf} : ""); } ################################# sub sort_objects { return ($a->{tbeg} < $b->{tbeg}-0.0001 ? -1 : ($a->{tbeg} > $b->{tbeg}+0.0001 ? 1 : (($a->{type} =~ /end/ and $b->{type} =~ /beg/) ? -1 : (($a->{type} =~ /beg/ and $b->{type} =~ /end/) ? 1 : $sort_order{$a->{type}} <=> $sort_order{$b->{type}})))); } ################################# sub sort_events { return ($a->{time} < $b->{time}-0.0001 ? -1 : ($b->{time} < $a->{time}-0.0001 ? 1 : (($a->{object} eq $b->{object}) ? ($a->{type} =~ /beg/ ? -1 : 1) : ($a->{type} =~ /beg/ ? ($b->{type} =~ /end/ ? 1 : $sort_order{$a->{object}{type}} <=> $sort_order{$b->{object}{type}}) : ($a->{type} =~ /end/ ? ($b->{type} =~ /beg/ ? -1 : $sort_order{$b->{object}{type}} <=> $sort_order{$a->{object}{type}}) : ($b->{type} =~ /end/ ? $sort_order{$b->{object}{type}} <=> $sort_order{$a->{object}{type}} : $sort_order{$a->{object}{type}} <=> $sort_order{$b->{object}{type}})))))); } ################################# sub extract_sgml_span { my ($name, $data) = @_; return () unless defined $name and defined $data; return () unless ($data =~ m|<$name(\s+[^>]*)*>(.*)|si); my $tag = $1; my $remainder = $2; return (undef, $remainder) if ($tag and $tag =~ m|/$|); ($data =~ m|<$name(\s+[^>]*)*>(.*?)(.*)|si) ? ($2, $3) : (); } ################################# sub extract_sgml_tag_and_span { my ($name, $data) = @_; return () unless defined $name and defined $data; return () unless ($data =~ m|<$name(\s+[^>]*)*>(.*)|si); my $tag = $1; my $remainder = $2; return ($tag, undef, $remainder) if ($tag and $tag =~ m|/$|); ($data =~ m|<$name(\s+[^>]*)*>(.*?)(.*)|si) ? ($1, $2, $3) : (); } ################################# sub extract_sgml_tag_attribute { my ($name, $data) = @_; return () unless defined $name and defined $data; ($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : (); } ################################# sub date_time_stamp { my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(); my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); my ($date, $time); $time = sprintf "%2.2d:%2.2d:%2.2d", $hour, $min, $sec; $date = sprintf "%4.4s %3.3s %s", 1900+$year, $months[$mon], $mday; return ($date, $time); } ################################# sub normalize_text { my ($object) = @_; if ($object->{ortho}) { # add more XML symbolic patterns as needed # $object->{ortho} =~ s/&/&/; # make alpha token to have a '.' (period) following it # if there isn't one already # if ($object->{stype} eq "alpha" and $object->{ortho} !~ /[a-zA-Z]\./) { $object->{ortho} =~ s/([a-zA-Z])/$1./; } # make token with preceeding '^' (caret) to have lex class of # propername # if ($object->{stype} eq "lex") { if ($object->{ortho} =~ /\^/) { $object->{ortho} =~ s/\^//; $object->{stype} = "propernoun"; } } } } ################################# sub normalize_swb1_text { my ($annotations) = @_; my ($id); foreach $id (keys %{$annotations}) { if ($annotations->{$id}{features}{text}) { # take out the word completion of a fragmented word # i.e. 'sh[e]-' as 'sh' with category as postfragment # '-[de]rived' as 'rived' with category as prefragment # if ($annotations->{$id}{features}{text} =~ /([-a-z.]+)\[.*?\]-/i) { $annotations->{$id}{features}{text} = $1; $annotations->{$id}{features}{category} = $annotations->{$id}{features}{category}."postFragment;"; } elsif ($annotations->{$id}{features}{text} =~ /-\[.*?\]([-a-z.]+)/i) { $annotations->{$id}{features}{text} = $1; $annotations->{$id}{features}{category} = $annotations->{$id}{features}{category}."preFragment;"; } # replace variant pronunciation words with intended word # i.e. 'because_1' as 'because' # $annotations->{$id}{features}{text} =~ s/_\d+//; } } }