#!/usr/bin/perl -w use strict; ################################# # History: # # version 10 # * require min_overlap of names in promote_entity_mention_to_mention_entity # * give 50% discount to precision and recall for attribute recognition errors # * trim leading/trailing whitespace from tag attributes # * relation type/subtype "OTHER-AFF"/"Other" added to symmetric list # * bug fix - always treat multi-line input strings as single string in regexp ops # * both TYPE and SUBTYPE must match for an entity to be "correct" # # version 09 # * bug fix in calculation of relation score # * addition of relation mention evaluation # * value formula for TERN evaluation changed and simplified # # version 08 # * bug fix in print_timex2_mapping (some attribute errors weren't detected) # * Recall/Precision/F-measure added to conditional error analysis # * TERN evaluation restricted to timex type = "TIMEX2" # * Change in timex2 value parameters # * in print_timex2_mapping: # - add attributes to printout of false alarms # - limit attribute listing to those being evaluated # - eliminate NON_SPECIFIC from the list of evaluated attributes # * added subtype "other" to entities of type LOCATION # * bug fix in extract_sgml_tag_attribute # * change timex2 value formula to weight value according to attributes # # version 07 # * bug fix in span_overlap # * timex2 evaluation added # * printout suppressed for entities/relations/events/timex2 # whenever there is no reference data for them # # version 06 # * scoring upgrade to match ace04-evalplan-v2 (May 22, 2004) # * input format conversion from ALF to APF # * LEXICALCONDITION made optional in relation_mention # * bug fix - eliminate demand for extent in get_relation_argument # # version 05 # * parameter sets contain modified mention coreference error weight # # version 04 # * bug fix in calculating entity FA and relation argument FA values # # version 03 # * Added evaluation of the entity mention task: to detect mentions # separate from entity assignment and associated coreference issues. # * Mention false alarms are deweighted when they exist in the reference # as valid mentions (i.e., deweight for "mere" coreference errors). # * Upgraded relation and event scoring to comprehend document-level # (cross-document) scoring, as is done for entities. # # version 02 # * Mapping speed-up by grouping entities/relations/events to be mapped # into minimum sized cohort sets and mapping on these smaller sets. # * Bug in mapping algorithm fixed. # # version 01 # This utility provides evaluation functionality for the ACE program. # * Derives from ace-eval-v03. # * Upgraded to include ACE events. # * Converted to read data from the new DTD (v3.0.2). # * Uses new scoring formulas # * Provides scoring mode control - selection of different sets # of scoring parameters # ################################# # SCORING PARAMETERS: # min_overlap is the minimum mutual fractional overlap allowed # for a mention head or a name to be declared as successfully detected. use vars qw ($min_overlap); $min_overlap = 0.3; #minimum fractional overlap for mention detection # max_diff is the maximum extent difference allowed for names and #mentions to be declared a "match". use vars qw ($max_diff_chars $max_diff_time $max_diff_xy); # max_diff_chars is the maximum extent difference # in characters for text sources. $max_diff_chars = 4; # max_diff_time is the maximum extent difference # in seconds for audio sources. $max_diff_time = 0.4; # max_diff_xy is the maximum extent difference # in centimeters for image mentions. $max_diff_xy = 0.4; my (%entity_err_wgt, $entity_fa_wgt); my (%entity_mention_err_wgt, $entity_mention_fa_wgt, $entity_mention_ref_fa_wgt); my (%relation_err_wgt, $relation_fa_wgt, $relation_argument_threshold); my (%event_err_wgt, $event_fa_wgt); my ($event_participant_role_err_wgt, $event_participant_fa_wgt, $event_participant_threshold); my ($timex2_fa_wgt); my %parameter_set = (MINSCORE => {entity_err_wgt => {TYPE => 0.00, SUBTYPE => 0.00, CLASS => 0.00}, entity_fa_wgt => 1.00, entity_mention_err_wgt => {TYPE => 0.00, ROLE => 0.00, STYLE => 0.00}, entity_mention_fa_wgt => 1.00, entity_mention_ref_fa_wgt => 1.00, relation_err_wgt => {TYPE => 0.00, SUBTYPE => 0.00}, relation_fa_wgt => 1.00, relation_argument_threshold => 0.00, event_err_wgt => {TYPE => 0.00, MODALITY => 0.00}, event_fa_wgt => 1.00, event_participant_role_err_wgt => 0.00, event_participant_fa_wgt => 1.00, event_participant_threshold => 0.75, timex2_fa_wgt => 1.00, }, HARD => {entity_err_wgt => {TYPE => 0.00, SUBTYPE => 0.50, CLASS => 0.50}, entity_fa_wgt => 1.00, entity_mention_err_wgt => {TYPE => 0.50, ROLE => 0.50, STYLE => 0.50}, entity_mention_fa_wgt => 1.00, entity_mention_ref_fa_wgt => 0.50, relation_err_wgt => {TYPE => 0.00, SUBTYPE => 0.50}, relation_fa_wgt => 1.00, relation_argument_threshold => 0.00, event_err_wgt => {TYPE => 0.00, MODALITY => 0.50}, event_fa_wgt => 1.00, event_participant_role_err_wgt => 0.00, event_participant_fa_wgt => 1.00, event_participant_threshold => 0.50, timex2_fa_wgt => 1.00, }, DEFAULT => {entity_err_wgt => {TYPE => 0.50, SUBTYPE => 0.90, CLASS => 0.75}, entity_fa_wgt => 0.75, entity_mention_err_wgt => {TYPE => 0.90, ROLE => 0.90, STYLE => 0.90}, entity_mention_fa_wgt => 0.75, entity_mention_ref_fa_wgt => 0.00, relation_err_wgt => {TYPE => 0.50, SUBTYPE => 0.90}, relation_fa_wgt => 0.75, relation_argument_threshold => 0.00, event_err_wgt => {TYPE => 0.50, MODALITY => 0.75}, event_fa_wgt => 0.75, event_participant_role_err_wgt => 0.50, event_participant_fa_wgt => 0.75, event_participant_threshold => 0.25, timex2_fa_wgt => 0.75, }, EASY => {entity_err_wgt => {TYPE => 0.75, SUBTYPE => 1.00, CLASS => 1.00}, entity_fa_wgt => 0.50, entity_mention_err_wgt => {TYPE => 1.00, ROLE => 1.00, STYLE => 1.00}, entity_mention_fa_wgt => 0.50, entity_mention_ref_fa_wgt => 0.00, relation_err_wgt => {TYPE => 0.75, SUBTYPE => 1.00}, relation_fa_wgt => 0.50, relation_argument_threshold => 0.00, event_err_wgt => {TYPE => 0.75, MODALITY => 1.00}, event_fa_wgt => 0.50, event_participant_role_err_wgt => 0.75, event_participant_fa_wgt => 0.50, event_participant_threshold => 0.00, timex2_fa_wgt => 0.50, }, MAXSCORE => {entity_err_wgt => {TYPE => 1.00, SUBTYPE => 1.00, CLASS => 1.00}, entity_fa_wgt => 0.00, entity_mention_err_wgt => {TYPE => 1.00, ROLE => 1.00, STYLE => 1.00}, entity_mention_fa_wgt => 0.00, entity_mention_ref_fa_wgt => 0.00, relation_err_wgt => {TYPE => 1.00, SUBTYPE => 1.00}, relation_fa_wgt => 0.00, relation_argument_threshold => 0.00, event_err_wgt => {TYPE => 1.00, MODALITY => 1.00}, event_fa_wgt => 0.00, event_participant_role_err_wgt => 1.00, event_participant_fa_wgt => 0.00, event_participant_threshold => 0.00, timex2_fa_wgt => 0.00, }, ); #Entity parameters my %normalize_mention_type = (NAM => "NAME", NOM => "NOMINAL", PRE => "PREMODIFIER", PRO => "PRONOUN"); my %mention_type_wgt = (NAME => 1.00, NOMINAL => 0.20, PREMODIFIER => 0.20, PRONOUN => 0.04); my %normalize_entity_type = (PER => "PERSON", ORG => "ORGANIZATION", VEH => "VEHICLE", WEA => "WEAPON", GPE => "GPE", GSP => "GPE", LOC => "LOCATION", FAC => "FACILITY", TMP => "TMP"); my %entity_type_wgt = (PERSON => 1.00, ORGANIZATION => 0.50, VEHICLE => 0.50, WEAPON => 0.50, GPE => 0.25, LOCATION => 0.10, FACILITY => 0.05, TMP => 0.05); my %normalize_entity_class = (SPE => "SPECIFIC", SPC => "SPECIFIC", GEN => "GENERIC", NEG => "NEGATIVE", UND => "UNDER-SPECIFIED", USP => "UNDER-SPECIFIED"); my %entity_class_wgt = (SPECIFIC => 1.00, GENERIC => 0.00, NEGATIVE => 0.00, "UNDER-SPECIFIED" => 0.00); #Relation parameters my %relation_type_wgt = (PHYS => 1.00, "PER-SOC" => 1.00, "EMP-ORG" => 1.00, ART => 1.00, "OTHER-AFF" => 1.00, "GPE-AFF" => 1.00, DISC => 1.00, METONYMY => 1.00); #Event parameters my %normalize_event_type = (BRK => "destroy", MAK => "create", GIV => "transfer", MOV => "move", INT => "interact", OTH => "other"); my %event_type_wgt = (destroy => 1.00, create => 1.00, transfer => 1.00, move => 1.00, interact => 1.00, other => 1.00); my %normalize_event_modality = (Real => "Real", NotReal => "NotReal"); my %event_modality_wgt = (Real => 1.00, NotReal => 1.00); #Timex2 parameters my $timex2_detection_wgt = 0.10; my %timex2_attribute_wgt = (ANCHOR_DIR => 0.25, ANCHOR_VAL => 0.50, MOD => 0.10, SET => 0.10, VAL => 1.00); #Entities are mapped subject to the following constraints: # * Each system output entity may map to only one reference entity. # * Each reference entity may map to at most one system output entity. # * An entity is mapped only if it improves the overall entity score. # * The mappings are chosen to maximize the overall entity score. my $epsilon = 1E-8; my $required_precision = 1E-12; my @mention_types = sort {$mention_type_wgt{$b} <=> $mention_type_wgt{$a} ? $mention_type_wgt{$b} <=> $mention_type_wgt{$a} : $a cmp $b} keys %mention_type_wgt; my @entity_types = sort {$entity_type_wgt{$b} <=> $entity_type_wgt{$a} ? $entity_type_wgt{$b} <=> $entity_type_wgt{$a} : $a cmp $b} keys %entity_type_wgt; my @entity_classes = sort {$entity_class_wgt{$b} <=> $entity_class_wgt{$a} ? $entity_class_wgt{$b} <=> $entity_class_wgt{$a} : $a cmp $b} keys %entity_class_wgt; my @relation_types = sort {$relation_type_wgt{$b} <=> $relation_type_wgt{$a} ? $relation_type_wgt{$b} <=> $relation_type_wgt{$a} : $a cmp $b} keys %relation_type_wgt; my @event_types = sort {$event_type_wgt{$b} <=> $event_type_wgt{$a} ? $event_type_wgt{$b} <=> $event_type_wgt{$a} : $a cmp $b} keys %event_type_wgt; my @event_modalities = sort {$event_modality_wgt{$b} <=> $event_modality_wgt{$a} ? $event_modality_wgt{$b} <=> $event_modality_wgt{$a} : $a cmp $b} keys %event_modality_wgt; my %entity_subtypes = (PERSON => {"" => 1}, ORGANIZATION => {Government => 1, Commercial => 1, Educational => 1, "Non-Profit" => 1, Other => 1}, LOCATION => {Address => 1, Boundary => 1, Celestial => 1, "Water-Body" => 1, "Land-Region-Natural" => 1, "Region-Local" => 1, "Region-Subnational" => 1, "Region-National" => 1, "Region-International" => 1, Other => 1}, GPE => {Continent => 1, Nation => 1, "State-or-Province" => 1, "County-or-District" => 1, "Population-Center" => 1, Other => 1}, FACILITY => {Building => 1, "Subarea-Building" => 1, "Bounded-Area" => 1, Conduit => 1, Path => 1, Barrier => 1, Plant => 1, Other => 1}, VEHICLE => {Land => 1, Air => 1, Water => 1, "Subarea-Vehicle" => 1, Other => 1}, WEAPON => {Blunt => 1, Exploding => 1, Sharp => 1, Chemical => 1, Biological => 1, Shooting => 1, Projectile => 1, Nuclear => 1, Other => 1}, TMP => {"" => 1}); my %relation_subtypes = (PHYS => {Located => 1, Near => 1, "Part-Whole" => 1}, "PER-SOC" => {Business => 1, Family => 1, Other => 1}, "EMP-ORG" => {"Employ-Executive" => 1, "Employ-Staff" => 1, "Employ-Undetermined" => 1, "Member-of-Group" => 1, Subsidiary => 1, Partner => 1, Other => 1}, ART => {"User-or-Owner" => 1, "Inventor-or-Manufacturer" => 1, Other => 1}, "OTHER-AFF" => {Ethnic => 1, Ideology => 1, Other => 1}, "GPE-AFF" => {"Citizen-or-Resident" => 1, "Based-In" => 1, Other => 1}, DISC => {"" => 1}, METONYMY => {"" => 1}); my %relation_symmetry = ("PHYS" => {Near => 1}, "PER-SOC" => {Business => 1, Family => 1, Other => 1}, "EMP-ORG" => {Partner => 1, Other => 1}, "OTHER-AFF" => {Other => 1}); my @relation_time_attributes = ("TYPE", "VAL", "MOD", "DIR"); ################################# # GLOBAL DATA use vars qw (%ref_database %tst_database %eval_docs %sys_docs); use vars qw ($print_entities $print_relations $print_events $print_timex2 $print_all_data $print_err_data); use vars qw ($input_file $input_doc $input_entity $input_relation $input_event $input_timex2); use vars qw ($usage); use vars qw ($best_score); use vars qw (%entity_type_statistics %entity_subtype_statistics %entity_class_statistics); use vars qw (%mention_detection_statistics %attribute_confusion_statistics); use vars qw (%mention_role_statistics %mention_style_statistics); use vars qw (%name_detection_statistics); use vars qw (%mapped_entity_value %mapped_entity_document_value); use vars qw (%mapped_relation_value %mapped_relation_document_value); use vars qw (%mapped_event_value %mapped_event_document_value); use vars qw (%mapped_timex2_value %mapped_timex2_document_value); use vars qw (%relation_type_statistics %relation_subtype_statistics %relation_class_statistics); use vars qw (%source_types $source_type $data_type); use vars qw ($total_words); use vars qw ($t_start_mapping); use vars qw (@entity_attributes); use vars qw (@error_types @xdoc_types @entity_count_types); use vars qw (@entity_value_types @entity_classes @entity_style_types); my @origins = ("DATABASE", "CORPUS"); @error_types = ("correct", "miss", "fa", "error"); @entity_attributes = ("ID", "TYPE", "SUBTYPE", "CLASS", "ORIGIN", "COUNTRY", "CONTINENT", "NATIONALITY"); @xdoc_types = ("1", ">1"); @entity_value_types = ("<0.1", "0.1-0.3", "0.3-1.0", "1-3", "3-10", ">10"); @entity_count_types = ("1", "2", "3-4", "5-8", ">8"); @entity_style_types = ("LITERAL", "METONYMIC"); use vars qw (@relation_attributes); use vars qw (@relation_class_types @relation_count_types @relation_argument_errors); @relation_attributes = ("ID", "TYPE", "SUBTYPE", "CLASS", "ORIGIN"); @relation_count_types = ("1", ">1"); @relation_class_types = ("EXPLICIT", "IMPLICIT"); @relation_argument_errors = ("0", "1", ">1"); my %normalize_relation_type; foreach my $type (@relation_types) { $normalize_relation_type{$type} = $type; } my %normalize_relation_class; foreach my $class (@relation_class_types) { $normalize_relation_class{$class} = $class; } use vars qw (@event_attributes); use vars qw (@event_count_types @event_participant_errors); @event_attributes = ("ID", "TYPE", "MODALITY", "ORIGIN"); @event_count_types = ("1", ">1"); @event_participant_errors = ("0", "1", ">1"); my %normalize_event_participant_role = (Agent => "Agent", Object => "Object", Source => "Source", Target => "Target", Location => "Location", Time => "Time", Modifier => "Modifier"); my @event_participant_roles = sort keys %normalize_event_participant_role; use vars qw (%normalize_event_mention_type %normalize_event_mention_source); %normalize_event_mention_type = (NOM => "nominal", SEN => "clausal"); %normalize_event_mention_source = (Event => "Event", Relation => "Relation"); use vars qw (%event_type_statistics %event_modality_statistics %participant_role_statistics); my $max_string_length = 40; my ($entity_serial_number, $relation_serial_number, $event_serial_number); my ($score_bound, $max_delta); my ($date, $time) = date_time_stamp(); print "ACE evaluation run on $date at $time\n"; print "command line: ", $0, " ", join(" ", @ARGV), "\n"; $usage = "\n\n$0 [-bghsaxemnid] -r -t \n\n". "Description: This Perl program evaluates ACE system performance.\n". "\n". "Required arguments:\n". " -r is a file containing a list of ACE reference data for\n". " all documents to be evaluated. These data must be in standard\n". " ACE apf format\n". " -t is a file containing a list of system ACE output data\n". " for all documents to be evaluated. These data must be in standard\n". " ACE apf format\n". "\n". "Optional arguments:\n". " -b is a file containing an ACE database\n". " of entities, relations and events\n". " -g includes GENERIC entities\n". " -h prints this help message to STDOUT\n". " -s prints ACE annotation data summary to STDOUT\n". " -m controls scoring mode by providing a selection of\n". " different parameters (pre)defined in named parameter sets\n". "\n". " The following flags control the output of comparison data for diagnosis:\n". " -a outputs report data for all entities, relations and events\n". " -e outputs report data for entities, relations and events with errors\n". "\n"; ################################# #MAIN { use vars qw ($opt_b $opt_g $opt_h $opt_s $opt_a $opt_e); use vars qw ($opt_r $opt_t $opt_d $opt_m); use Getopt::Std; getopts ('ghsaedr:t:b:m:'); die $usage if defined($opt_h); $print_entities = defined($opt_s); $print_relations = defined($opt_s); $print_events = defined($opt_s); $print_timex2 = defined($opt_s); $print_all_data = defined($opt_a); $print_err_data = defined($opt_e); defined($opt_r) and defined($opt_t) or die "Error in command line$usage"; $entity_class_wgt{GENERIC} = $entity_class_wgt{SPECIFIC} if $opt_g; select_parameter_set ($opt_m ? $opt_m : "DEFAULT"); print_parameters (); #read in the data my $t0 = (times)[0]; get_database ($opt_b) if $opt_b; get_data ("REF", \%ref_database, \%eval_docs, $opt_r); get_data ("TST", \%tst_database, \%sys_docs, $opt_t); check_docs (); #evaluate entities my $t1 = (times)[0]; compute_entity_values (); my $t2 = (times)[0]; map_entities (); my $t3 = (times)[0]; if ((keys %{$ref_database{entities}})>0) { print_entity_mapping (\%ref_database, \%tst_database, "entity mapping") if ((keys %{$tst_database{entities}})>0 and ($print_all_data or $print_err_data)); score_entity_detection ("entity scoring"); score_entity_attribute_recognition (); score_mention_detection (); score_mention_attribute_recognition (); } #evaluate relations my $t4 = (times)[0]; compute_relation_values (); my $t5 = (times)[0]; map_relations (); my $t6 = (times)[0]; if ((keys %{$ref_database{relations}})>0) { print_relation_mapping (\%ref_database, \%tst_database, "relation mapping") if ((keys %{$tst_database{relations}})>0 and ($print_all_data or $print_err_data)); score_relation_detection ("relation scoring"); score_relation_attribute_recognition (); } #evaluate events my $t7 = (times)[0]; compute_event_values (); my $t8 = (times)[0]; map_events (); my $t9 = (times)[0]; if ((keys %{$ref_database{events}})>0) { print_event_mapping (\%ref_database, \%tst_database, "event mapping") if ((keys %{$tst_database{events}})>0 and ($print_all_data or $print_err_data)); score_event_detection ("event scoring"); score_event_attribute_recognition (); } #evaluate entity mentions without regard to entity affiliation my $t10 = (times)[0]; $ref_database{entities} = $ref_database{mention_entities}; $tst_database{entities} = $tst_database{mention_entities}; compute_entity_values (); my $t11 = (times)[0]; map_entities (); my $t12 = (times)[0]; if ((keys %{$ref_database{entities}})>0) { print_entity_mapping (\%ref_database, \%tst_database, "entity mention mapping") if ((keys %{$tst_database{entities}})>0 and ($print_all_data or $print_err_data)); score_entity_detection ("entity mention scoring"); score_entity_attribute_recognition (); } #evaluate relation mentions without regard to relation affiliation my $t13 = (times)[0]; $ref_database{relations} = $ref_database{mention_relations}; $tst_database{relations} = $tst_database{mention_relations}; compute_relation_values (); my $t14 = (times)[0]; map_relations (); my $t15 = (times)[0]; if ((keys %{$ref_database{relations}})>0) { print_relation_mapping (\%ref_database, \%tst_database, "relation mention mapping") if ((keys %{$tst_database{relations}})>0 and ($print_all_data or $print_err_data)); score_relation_detection ("relation mention scoring"); score_relation_attribute_recognition (); } #evaluate timex2 expressions my $t16 = (times)[0]; compute_timex2_values (); my $t17 = (times)[0]; map_timex2s (); my $t18 = (times)[0]; if ((keys %{$ref_database{timex2s}})>0) { print_timex2_mapping (\%ref_database, \%tst_database, "timex2 mapping") if ((keys %{$tst_database{timex2s}})>0 and ($print_all_data or $print_err_data)); score_timex2_detection ("timex2 scoring"); score_timex2_attribute_recognition (); score_timex2_mention_detection (); } my $t19 = (times)[0]; printf "\ndata input: %8.2f secs to load data\n", $t1-$t0; printf "entity eval: %8.2f secs to compute values,%8.2f secs to map,%8.2f secs to score\n", $t2-$t1, $t3-$t2, $t4-$t3; printf "relation eval: %8.2f secs to compute values,%8.2f secs to map,%8.2f secs to score\n", $t5-$t4, $t6-$t5, $t7-$t6; printf "event eval: %8.2f secs to compute values,%8.2f secs to map,%8.2f secs to score\n", $t8-$t7, $t9-$t8, $t10-$t9; printf "entity mention eval: %8.2f secs to compute values,%8.2f secs to map,%8.2f secs to score\n", $t11-$t10, $t12-$t11, $t13-$t12; printf "relation mention eval:%8.2f secs to compute values,%8.2f secs to map,%8.2f secs to score\n", $t14-$t13, $t15-$t14, $t16-$t15; printf "timex2 eval: %8.2f secs to compute values,%8.2f secs to map,%8.2f secs to score\n", $t17-$t16, $t18-$t17, $t19-$t18; } ################################# sub get_data { my ($label, $db, $docs, $list) = @_; open (LIST, $list) or die "\nUnable to open file list '$list'", $usage; while ($input_file = ) { chomp $input_file; get_document_data ($db, $docs, $input_file); } close (LIST); check_relation_data ($db); print_document_data ($label, $docs) if $opt_s; print_entity_data ($label, $db) if $print_entities; print_relation_data ($label, $db) if $print_relations; print_event_data ($label, $db) if $print_events; print_timex2_data ($label, $db) if $print_timex2; } ################################# sub check_relation_data { my ($db) = @_; my ($id, $relation, $arg); foreach $id (keys %{$db->{relations}}) { $relation = $db->{relations}{$id}; foreach $arg (@{$relation->{arguments}}) { exists $db->{entities}{$arg->{ID}} or die "\n\nFATAL INPUT ERROR: relation '$relation->{ID}' references a non-existent argument". " ('$arg->{ID}')\n\n"; } } } ################################# sub select_parameter_set { my ($name) = @_; my ($p, $type); $name = uc $name; if (not defined $parameter_set{$name}) { print STDERR "\n\nFATAL ERROR: unknown parameter set name ($name)\n". " available parameter set names are:\n"; foreach $name (sort keys %parameter_set) { printf STDERR " %s\n", $name; } print STDERR "\n"; die; } #Entity parameters foreach $type (@mention_types) { $p = $parameter_set{$name}{mention_type_wgt}{$type}; $mention_type_wgt{$type} = $p if defined $p; } foreach $type (@entity_types) { $p = $parameter_set{$name}{entity_type_wgt}{$type}; $entity_type_wgt{$type} = $p if defined $p; } foreach $type (keys %{$parameter_set{$name}{entity_err_wgt}}) { $p = $parameter_set{$name}{entity_err_wgt}{$type}; $entity_err_wgt{$type} = $p if defined $p; } foreach $type (keys %{$parameter_set{$name}{entity_mention_err_wgt}}) { $p = $parameter_set{$name}{entity_mention_err_wgt}{$type}; $entity_mention_err_wgt{$type} = $p if defined $p; } $p = $parameter_set{$name}{entity_fa_wgt}; $entity_fa_wgt = $p if defined $p; $p = $parameter_set{$name}{entity_mention_fa_wgt}; $entity_mention_fa_wgt = $p if defined $p; $p = $parameter_set{$name}{entity_mention_ref_fa_wgt}; $entity_mention_ref_fa_wgt = $p if defined $p; #Relation parameters foreach $type (keys %{$parameter_set{$name}{relation_type_wgt}}) { $p = $parameter_set{$name}{relation_type_wgt}{$type}; $relation_type_wgt{$type} = $p if defined $p; } foreach $type (keys %{$parameter_set{$name}{relation_err_wgt}}) { $p = $parameter_set{$name}{relation_err_wgt}{$type}; $relation_err_wgt{$type} = $p if defined $p; } $p = $parameter_set{$name}{relation_fa_wgt}; $relation_fa_wgt = $p if defined $p; $p = $parameter_set{$name}{relation_argument_threshold}; $relation_argument_threshold = $p if defined $p; #Event parameters foreach $type (keys %{$parameter_set{$name}{event_type_wgt}}) { $p = $parameter_set{$name}{event_type_wgt}{$type}; $event_type_wgt{$type} = $p if defined $p; } foreach $type (keys %{$parameter_set{$name}{event_modality_wgt}}) { $p = $parameter_set{$name}{event_modality_wgt}{$type}; $event_modality_wgt{$type} = $p if defined $p; } foreach $type (keys %{$parameter_set{$name}{event_err_wgt}}) { $p = $parameter_set{$name}{event_err_wgt}{$type}; $event_err_wgt{$type} = $p if defined $p; } $p = $parameter_set{$name}{event_participant_role_err_wgt}; $event_participant_role_err_wgt = $p if defined $p; $p = $parameter_set{$name}{event_fa_wgt}; $event_fa_wgt = $p if defined $p; $p = $parameter_set{$name}{event_participant_fa_wgt}; $event_participant_fa_wgt = $p if defined $p; $p = $parameter_set{$name}{event_participant_threshold}; $event_participant_threshold = $p if defined $p; #Timex2 parameters $p = $parameter_set{$name}{timex2_detection_wgt}; $timex2_detection_wgt = $p if defined $p; foreach $type (keys %{$parameter_set{$name}{timex2_attribute_wgt}}) { $p = $parameter_set{$name}{timex2_attribute_wgt}{$type}; $timex2_attribute_wgt{$type} = $p if defined $p; } $p = $parameter_set{$name}{timex2_fa_wgt}; $timex2_fa_wgt = $p if defined $p; } ################################# sub print_parameters { printf "PARAMETERS:\n". " min mutual fractional overlap of matching mention heads or names:\n". "%11.3f\n". " max allowable extent difference for names and mentions to match:\n". "%11d chars for text sources\n". "%11.3f sec for audio sources\n". "%11.3f cm for image sources\n", $min_overlap, $max_diff_chars, $max_diff_time, $max_diff_xy; #Entity parameters print "\n"; print " Entity mention values:\n"; foreach my $type (@mention_types) { printf "%11.3f for type %s\n", $mention_type_wgt{$type}, $type; } print " Entity value weights for entity types:\n"; foreach my $type (@entity_types) { printf "%11.3f for type %s\n", $entity_type_wgt{$type}, $type; } print " Entity value weights for entity classes:\n"; foreach my $class (@entity_classes) { printf "%11.3f for class %s\n", $entity_class_wgt{$class}, $class; } print " Entity value discounts for entity attribute recognition errors:\n"; foreach my $type (sort keys %entity_err_wgt) { printf "%11.3f for $type errors\n", $entity_err_wgt{$type}; } print " Entity mention value discounts for mention attribute recognition errors:\n"; foreach my $type (sort keys %entity_mention_err_wgt) { printf "%11.3f for $type errors\n", $entity_mention_err_wgt{$type}; } printf " Entity value (cost) weight for spurious (false alarm) entities:%6.3f\n", $entity_fa_wgt; printf " Entity mention value (cost) weight for spurious entity mentions:%6.3f\n", $entity_mention_fa_wgt; printf " Entity mention value (cost) discount for incorrect coreference:%6.3f\n", $entity_mention_ref_fa_wgt; #Relation parameters print "\n"; print " Relation value weights for relation types:\n"; foreach my $type (@relation_types) { printf "%11.3f for type %s\n", $relation_type_wgt{$type}, $type; } print " Relation value discounts for relation attribute recognition errors:\n"; foreach my $type (sort keys %relation_err_wgt) { printf "%11.3f for $type errors\n", $relation_err_wgt{$type}; } printf " Relation value (cost) weight for spurious (false alarm) relations:%6.3f\n", $relation_fa_wgt; printf " Minimum fractional value for candidate relation arguments:%6.3f\n", $relation_argument_threshold; #Event parameters print "\n"; print " Event value weights for event types:\n"; foreach my $type (@event_types) { printf "%11.3f for type %s\n", $event_type_wgt{$type}, $type; } print " Event value weights for event modalities:\n"; foreach my $type (@event_modalities) { printf "%11.3f for modality %s\n", $event_modality_wgt{$type}, $type; } print " Event value discounts for event attribute recognition errors:\n"; foreach my $type (sort keys %event_err_wgt) { printf "%11.3f for $type errors\n", $event_err_wgt{$type}; } printf " Event value (cost) weight for spurious (false alarm) events:%6.3f\n", $event_fa_wgt; printf " Event value (cost) weight for spurious event participants:%6.3f\n", $event_participant_fa_wgt; printf " Event participant value (cost) discount for participant role errors:%6.3f\n", $event_participant_role_err_wgt; printf " Minimum fracional value for candidate event participants:%6.3f\n", $event_participant_threshold; #Timex2 parameters print "\n"; print " Timex2 attribute value weights for timex2 attributes:\n"; printf "%11.3f for timex2 detection\n", $timex2_detection_wgt; foreach my $type (sort keys %timex2_attribute_wgt) { printf "%11.3f for type %s\n", $timex2_attribute_wgt{$type}, $type; } printf " Timex2 value (cost) weight for spurious (false alarm) timex2's:%6.3f\n", $timex2_fa_wgt; print "\n"; } ################################# sub check_docs { my ($doc_id, $eval_doc, $sys_doc); foreach $doc_id (keys %eval_docs) { $eval_doc = $eval_docs{$doc_id}; $sys_doc = $sys_docs{$doc_id}; $sys_doc or warn "\n\nWARNING: ref doc '$doc_id' has no corresponding tst doc\n\n"; next unless $sys_doc; $sys_doc->{TYPE} eq $eval_doc->{TYPE} or die "\n\nFATAL ERROR: different reference and system output data types for document '$doc_id'\n". " ref type is '$eval_doc->{TYPE}' but system output type is '$sys_doc->{TYPE}'\n\n"; } } ################################# sub score_entity_detection { print "\n======== $_[0] ========\n"; edt_eval_1_condition ("entity", "type", "ENTITY TYPE", \@entity_types); edt_eval_1_condition ("entity", "value", "ENTITY VALUE", \@entity_value_types); edt_eval_1_condition ("mention", "count", "COUNT TYPE", \@entity_count_types); edt_eval_1_condition ("class", "type", "ENTITY CLASS", \@entity_classes); my @types = sort keys %source_types; edt_eval_1_condition ("source", "type", "SOURCE TYPE", \@types); # edt_eval_2_conditions ("Ndoc", "type", "CROSS-DOC TYPE", \@xdoc_types, # "entities of type", "ENTITY TYPE", \@entity_types); # edt_eval_2_conditions ("entity", "type", "ENTITY TYPE", \@entity_types, # "entities of origin", "ENTITY ORIGIN", \@origins); # link_eval_2_conditions ("entity", "type", "ENTITY TYPE", \@entity_types, # "entities of origin", "ENTITY ORIGIN", \@origins); } ################################# sub edt_eval_1_condition { my ($label1, $label2, $condition, $types) = @_; my ($type, $count, $cost, $nrm_cost); my $hdr1 = "________Count________ __________Count_(%)__________ _________________Cost_(%)___________________ _____Unconditioned_Cost_(%)_____"; my $hdr2 = "Ent Detection Type Detection Type Unweighted Detection Type Type Value Value-based Max Detection Type Type"; my $hdr3 = "Tot FA Miss Err FA Miss Err Pre--Rec--F FA Miss Err Corr (%) Pre--Rec--F Value FA Miss Err Corr"; ($count, $cost, $nrm_cost) = conditional_EDT_statistics ($condition); print "\nEntity Detection and Type Recognition statistics for \"$source_type\" sources:\n"; printf " ref %s\n %-8s %s\n %-8s %s\n", $hdr1, $label1, $hdr2, $label2, $hdr3; foreach $type (@$types, "total") { print_eval ($type, $count, $cost, $nrm_cost, $nrm_cost->{total}, $entity_fa_wgt); } } ################################# sub print_eval { my ($type, $count, $cost, $ref_value, $total_value, $fa_wgt) = @_; my $format = "%7.7s%6d%6d%6d%6d%8.1f%6.1f%6.1f%7.1f%5.1f%5.1f%8.1f%6.1f%6.1f%6.1f%8.1f%7.1f%5.1f%5.1f%9.2f%7.2f%7.2f%7.2f%7.2f\n"; $count = $count->{$type}; $cost = $cost->{$type}; $ref_value = $ref_value->{$type}; my $nref = $count->{correct}+$count->{error}+$count->{miss}; my $nsys = $count->{correct}+$count->{error}+$count->{fa}; my $pn = 100/max(1E-30, $nref); my $cn = 100/max(1E-30, $ref_value); my $recall = ($count->{correct}+0.5*$count->{error})/max($nref,1E-30); my $precision = ($count->{correct}+0.5*$count->{error})/max($nsys,1E-30); my $fmeasure = 2*$precision*$recall/max($precision+$recall, 1E-30); my $value_correct = $ref_value-$cost->{miss}-$cost->{error}-$cost->{correct}; my $value_recall = ($value_correct+0.5*($cost->{error}+$cost->{correct}))/max(1E-30, $ref_value); my $sys_value = $ref_value-$cost->{miss}+$cost->{fa}/max($fa_wgt,1E-30); my $value_precision = ($value_correct+0.5*($cost->{error}+$cost->{correct}))/max(1E-30, $sys_value); my $value_fmeasure = 2*$value_precision*$value_recall/max($value_precision+$value_recall, 1E-30); my $un = 100/max($total_value,1E-30); printf $format, $type, $nref, $count->{fa}, $count->{miss}, $count->{error}, min(999.9,$pn*$count->{fa}), $pn*$count->{miss}, $pn*$count->{error}, 100*$precision, 100*$recall, 100*$fmeasure, min(999.9,$cn*$cost->{fa}), $cn*$cost->{miss}, $cn*$cost->{error}, $cn*$cost->{correct}, max(-999.9,$cn*($value_correct-$cost->{fa})), 100*$value_precision, 100*$value_recall, 100*$value_fmeasure, $un*$ref_value, min(999.99,$un*$cost->{fa}), $un*$cost->{miss}, $un*$cost->{error}, $un*$cost->{correct}; } ################################# sub edt_eval_2_conditions { my ($label1, $label2, $condition1, $types1, $label3, $condition2, $types2) = @_; my ($type, $count, $cost, $nrm_cost); my $hdr1 = "________Count________ __________Count_(%)__________ _________________Cost_(%)___________________ _____Unconditioned_Cost_(%)_____"; my $hdr2 = "Ent Detection Type Detection Type Unweighted Detection Type Type Value Value-based Max Detection Type Type"; my $hdr3 = "Tot FA Miss Err FA Miss Err Pre--Rec--F FA Miss Err Corr (%) Pre--Rec--F Value FA Miss Err Corr"; ($count, $cost, $nrm_cost) = conditional_EDT_statistics ($condition1, $condition2); foreach my $cond ("ALL", @$types2) { print "\nEntity Detection and Type Recognition statistics for \"$source_type\" sources and $label3 \"$cond\":\n"; printf " ref %s\n %-8s %s\n %-8s %s\n", $hdr1, $label1, $hdr2, $label2, $hdr3; foreach $type (@$types1, "total") { print_eval ($type, $count->{$cond}, $cost->{$cond}, $nrm_cost->{$cond}, $nrm_cost->{ALL}{total}, $entity_fa_wgt); } } } ################################# sub link_eval_2_conditions { my ($label1, $label2, $condition1, $types1, $label3, $condition2, $types2) = @_; my ($type, $count, $cost, $nrm_cost); my $hdr1 = "________Count________ __________Count_(%)__________ _________________Cost_(%)___________________ _____Unconditioned_Cost_(%)_____"; my $hdr2 = "Ent Detection Link Detection Link Unweighted Detection Link Link Value Value-based Max Detection Link Link"; my $hdr3 = "Tot FA Miss Err FA Miss Err Pre--Rec--F FA Miss Err Corr (%) Pre--Rec--F Value FA Miss Err Corr"; ($count, $cost, $nrm_cost) = conditional_entity_link_statistics ($condition1, $condition2); foreach my $cond ("ALL", @$types2) { print "\nEntity Detection and Database Linking statistics for \"$source_type\" sources and $label3 \"$cond\":\n"; printf " ref %s\n %-8s %s\n %-8s %s\n", $hdr1, $label1, $hdr2, $label2, $hdr3; foreach $type (@$types1, "total") { print_eval ($type, $count->{$cond}, $cost->{$cond}, $nrm_cost->{cond}, $nrm_cost->{ALL}{total}, $entity_fa_wgt); } } } ################################# sub score_relation_detection { print "\n======== $_[0] ========\n"; rdc_eval_1_condition ("relation", "type", "RELATION TYPE", \@relation_types); rdc_eval_1_condition ("class", "type", "RELATION CLASS", \@relation_class_types); rdc_eval_1_condition ("mention", "count", "COUNT TYPE", \@relation_count_types); my @types = sort keys %source_types; rdc_eval_1_condition ("source", "type", "SOURCE TYPE", \@types); rdc_eval_1_condition ("argument", "errors", "ARGUMENT ERRORS", \@relation_argument_errors); # rdc_eval_2_conditions ("relation", "type", "RELATION TYPE", \@relation_types, # "argument errors of type", "ARGUMENT ERRORS", \@relation_argument_errors); } ################################# sub rdc_eval_1_condition { my ($label1, $label2, $condition, $types) = @_; my ($type, $count, $cost, $nrm_cost); my $hdr1 = "________Count________ __________Count_(%)__________ _________________Cost_(%)___________________ _____Unconditioned_Cost_(%)_____"; my $hdr2 = "Rel Detection ATSt Detection ATSt Unweighted Detection ATSt ATSt Value Value-based Max Detection ATSt ATSt"; my $hdr3 = "Tot FA Miss Err FA Miss Err Pre--Rec--F FA Miss Err Corr (%) Pre--Rec--F Value FA Miss Err Corr"; ($count, $cost, $nrm_cost) = conditional_RDC_statistics ($condition); print "\nRelation Detection and Type Recognition statistics for \"$source_type\" sources:\n"; printf " ref %s\n %-8s %s\n %-8s %s\n", $hdr1, $label1, $hdr2, $label2, $hdr3; foreach $type (@$types, "total") { print_eval ($type, $count, $cost, $nrm_cost, $nrm_cost->{total}, $relation_fa_wgt); } } ################################# sub rdc_eval_2_conditions { my ($label1, $label2, $condition1, $types1, $label3, $condition2, $types2) = @_; my ($type, $count, $cost, $nrm_cost); my $hdr1 = "________Count________ __________Count_(%)__________ _________________Cost_(%)___________________ _____Unconditioned_Cost_(%)_____"; my $hdr2 = "Rel Detection ATSt Detection ATSt Unweighted Detection ATSt ATSt Value Value-based Max Detection ATSt ATSt"; my $hdr3 = "Tot FA Miss Err FA Miss Err Pre--Rec--F FA Miss Err Corr (%) Pre--Rec--F Value FA Miss Err Corr"; ($count, $cost, $nrm_cost) = conditional_RDC_statistics ($condition1, $condition2); foreach my $cond ("ALL", @$types2) { print "\nRelation Detection and Type Recognition statistics for \"$source_type\" sources and $label3 \"$cond\":\n"; printf " ref %s\n %-8s %s\n %-8s %s\n", $hdr1, $label1, $hdr2, $label2, $hdr3; foreach $type (@$types1, "total") { print_eval ($type, $count->{$cond}, $cost->{$cond}, $nrm_cost->{$cond}, $nrm_cost->{ALL}{total}, $relation_fa_wgt); } } } ################################# sub score_event_detection { print "\n======== $_[0] ========\n"; edc_eval_1_condition ("event", "type", "EVENT TYPE", \@event_types); edc_eval_1_condition ("modality", "type", "EVENT MODALITY", \@event_modalities); edc_eval_1_condition ("mention", "count", "COUNT TYPE", \@event_count_types); my @types = sort keys %source_types; edc_eval_1_condition ("source", "type", "SOURCE TYPE", \@types); edc_eval_1_condition ("participant", "errors", "PARTICIPANT ERRORS", \@event_participant_errors); # edc_eval_2_conditions ("event", "type", "EVENT TYPE", \@event_types, # "participant errors of type", "PARTICIPANT ERRORS", \@event_participant_errors); } ################################# sub edc_eval_1_condition { my ($label1, $label2, $condition, $types) = @_; my ($type, $count, $cost, $nrm_cost); my $hdr1 = "________Count________ __________Count_(%)__________ _________________Cost_(%)___________________ _____Unconditioned_Cost_(%)_____"; my $hdr2 = "Evt Detection PTM Detection PTM Unweighted Detection PTM PTM Value Value-based Max Detection PTM PTM"; my $hdr3 = "Tot FA Miss Err FA Miss Err Pre--Rec--F FA Miss Err Corr (%) Pre--Rec--F Value FA Miss Err Corr"; ($count, $cost, $nrm_cost) = conditional_EDC_statistics ($condition); print "\nEvent Detection and Type Recognition statistics for \"$source_type\" sources:\n"; printf " ref %s\n %-8.8s %s\n %-8.8s %s\n", $hdr1, $label1, $hdr2, $label2, $hdr3; foreach $type (@$types, "total") { print_eval ($type, $count, $cost, $nrm_cost, $nrm_cost->{total}, $event_fa_wgt); } } ################################# sub edc_eval_2_conditions { my ($label1, $label2, $condition1, $types1, $label3, $condition2, $types2) = @_; my ($type, $count, $cost, $nrm_cost); my $hdr1 = "________Count________ __________Count_(%)__________ _________________Cost_(%)___________________ _____Unconditioned_Cost_(%)_____"; my $hdr2 = "Evt Detection PTM Detection PTM Unweighted Detection PTM PTM Value Value-based Max Detection PTM PTM"; my $hdr3 = "Tot FA Miss Err FA Miss Err Pre--Rec--F FA Miss Err Corr (%) Pre--Rec--F Value FA Miss Err Corr"; ($count, $cost, $nrm_cost) = conditional_EDC_statistics ($condition1, $condition2); foreach my $cond ("ALL", @$types2) { print "\nEvent Detection and Type Recognition statistics for \"$source_type\" sources and $label3 \"$cond\":\n"; printf " ref %s\n %-8.8s %s\n %-8.8s %s\n", $hdr1, $label1, $hdr2, $label2, $hdr3; foreach $type (@$types1, "total") { print_eval ($type, $count->{$cond}, $cost->{$cond}, $nrm_cost->{$cond}, $nrm_cost->{ALL}{total}, $event_fa_wgt); } } } ################################# sub score_timex2_detection { print "\n======== $_[0] ========\n"; my @types = sort keys %source_types; timex2_eval_1_condition ("source", "type", "SOURCE TYPE", \@types); } ################################# sub timex2_eval_1_condition { my ($label1, $label2, $condition, $types) = @_; my ($type, $count, $cost, $nrm_cost); my $hdr1 = "________Count________ __________Count_(%)__________ _________________Cost_(%)___________________ _____Unconditioned_Cost_(%)_____"; my $hdr2 = "Tmx Detection Atr Detection Atr Unweighted Detection Atr Atr Value Value-based Max Detection Atr Atr"; my $hdr3 = "Tot FA Miss Err FA Miss Err Pre--Rec--F FA Miss Err Corr (%) Pre--Rec--F Value FA Miss Err Corr"; ($count, $cost, $nrm_cost) = conditional_TIMEX2_statistics ($condition); print "\nTimex2 Detection and Type Recognition statistics for \"$source_type\" sources:\n"; printf " ref %s\n %-8.8s %s\n %-8.8s %s\n", $hdr1, $label1, $hdr2, $label2, $hdr3; foreach $type (@$types, "total") { print_eval ($type, $count, $cost, $nrm_cost, $nrm_cost->{total}, $timex2_fa_wgt); } } ################################# sub score_timex2_attribute_recognition { my ($attribute, $ref_values, $ref_value, $tst_values, $tst_value); my $maxdisplay = 8; #display dominant confusion statistics for each timex2 attribute foreach $attribute (sort keys %attribute_confusion_statistics) { my (%ref_count, %tst_count, %sort_count, @display_values, $ndisplay); my ($count, $ntot, $nref, $ncor, $nfa, $nmiss); $ntot = $nref = $ncor = $nfa = $nmiss = 0; $ref_values = $attribute_confusion_statistics{$attribute}; #select attribute values that contribute the most confusions foreach $ref_value (keys %$ref_values) { $tst_values = $ref_values->{$ref_value}; foreach $tst_value (keys %$tst_values) { $count = $tst_values->{$tst_value}; $ref_count{$ref_value} += $count; $tst_count{$tst_value} += $count; $ntot += $count; $nref += $count unless $ref_value eq ""; if ($tst_value eq $ref_value) { $sort_count{$ref_value} += $epsilon*$count; $ncor += $count unless $ref_value eq ""; } else { $sort_count{$tst_value} += $count; $sort_count{$ref_value} += $count; $nfa += $count if $ref_value eq ""; $nmiss += $count if $tst_value eq ""; } } } @display_values = sort {$sort_count{$b} <=> $sort_count{$a}} keys %sort_count; $ndisplay = min($maxdisplay, scalar @display_values); splice (@display_values, $ndisplay); #tabulate confusion statistics for "other" attribute values my $others = "all others"; foreach my $value (@display_values, $others) { $ref_values->{$value}{$others} = $ref_count{$value}; $ref_values->{$others}{$value} = $tst_count{$value}; } $ref_values->{$others}{$others} = $ntot; foreach $ref_value (@display_values) { foreach $tst_value (@display_values) { $count = $ref_values->{$ref_value}{$tst_value}; next unless $count; $ref_values->{$ref_value}{$others} -= $count; $ref_values->{$others}{$tst_value} -= $count; $ref_values->{$others}{$others} -= $count; } } #output results print "\nTimex2 recognition statistics for attribute $attribute (for mapped timex2's):\n"; my $nerr = $nfa+$nref-$ncor; my $nsub = $nref-$nmiss-$ncor; my $nsys = $ncor+$nsub+$nfa; $nref = max($nref,$epsilon); $nsys = max($nsys,$epsilon); my $pfa = $nfa/$nref; my $psub = $nsub/$nref; my $pmiss = $nmiss/$nref; my $perror = $nerr/$nref; my $recall = ($ncor+0.5*$nsub)/$nref; my $precision = ($ncor+0.5*$nsub)/$nsys; my $f = 2*$precision*$recall/max($precision+$recall,$epsilon); printf " Summary (count/percent): Nref=%d/%.1f%s, Nfa=%d/%.1f%s, Nmiss=%d/%.1f%s, Nsub=%d/%.1f%s, Nerr=%d/%.1f%s" .", P/R/F=%.1f%s/%.1f%s/%.1f%s\n", $nref, 100, "%", $nfa, min(999.9,100*$pfa), "%", $nmiss, 100*$pmiss, "%", $nsub, min(999.9,100*$psub), "%", $nerr, min(999.9,100*$perror), "%", 100*$precision, "%", 100*$recall, "%", 100*$f, "%"; print " Confusion matrix for major error contributors (count/percent):\n ref\\tst:"; push @display_values, $others; foreach $tst_value (@display_values) { printf "%11.11s ", $tst_value; } print "\n"; foreach $ref_value (@display_values) { printf " %14.14s", $ref_value; foreach $tst_value (@display_values) { $count = $ref_values->{$ref_value}{$tst_value}; printf "%s", $count ? (sprintf "%6d/%4.1f%s", $count, min(99.9,100*$count/max($ntot,$epsilon)), "%") : " - "; } print "\n"; } } } ################################# sub score_timex2_mention_detection { my (%men_count, $type, $men_type, $rol_type, $sty_type, $err_type); my ($pn); #scoring conditioned on mention type undef %men_count; foreach $err_type (@error_types) { my $nent = $mention_detection_statistics{$err_type}; $men_count {total}{$err_type} += $nent ? $nent : 0; } print "\nTimex2 Mention Detection and EXACT Extent Recognition statistics for \"$source_type\" sources (for mapped timex2's):\n", " ____________count______________ ____________percent____________\n", " Detection Extent_Recognition Detection Extent_Recognition\n", " miss fa miss err corr miss fa miss err corr\n"; foreach $type ("total") { $pn = 100/max($epsilon, $men_count{$type}{miss}+$men_count{$type}{error}+$men_count{$type}{correct}); printf "%5.5s%8d%6d%9d%6d%6d%11.1f%6.1f%9.1f%6.1f%6.1f\n", $type, $men_count{$type}{miss}, $men_count{$type}{fa}, $men_count{$type}{miss}, $men_count{$type}{error}, $men_count{$type}{correct}, $pn*$men_count{$type}{miss}, min(999.9,$pn*$men_count{$type}{fa}), $pn*$men_count{$type}{miss}, $pn*$men_count{$type}{error}, $pn*$men_count{$type}{correct}; } } ################################# sub score_entity_attribute_recognition { # type attributes my ($type, $ref_type); my (%entity_type_total); foreach $ref_type (@entity_types) { foreach $type (@entity_types) { $entity_type_statistics{$ref_type}{$type} = 0 unless defined $entity_type_statistics{$ref_type}{$type}; $entity_type_total{$ref_type} += $entity_type_statistics{$ref_type}{$type}; } } print "\nEntity Type confusion matrix for \"$source_type\" sources (for mapped entities):\n", " ___________count___________ __________percent__________\n", " ref\\tst: "; foreach $type (@entity_types) { printf " %3.3s ", $type; } print " "; foreach $type (@entity_types) { printf " %3.3s", $type; } print "\n"; foreach $ref_type (@entity_types) { printf " %3.3s ", $ref_type; foreach $type (@entity_types) { printf "%6d", $entity_type_statistics{$ref_type}{$type}; } print " "; foreach $type (@entity_types) { printf "%6.1f", 100*$entity_type_statistics{$ref_type}{$type} / max($entity_type_total{$ref_type},1); } print "\n"; } # class attributes my ($class, $ref_class); my (%entity_class_total); foreach $ref_class (@entity_classes) { foreach $class (@entity_classes) { $entity_class_statistics{$ref_class}{$class} = 0 unless defined $entity_class_statistics{$ref_class}{$class}; $entity_class_total{$ref_class} += $entity_class_statistics{$ref_class}{$class}; } } print "\nEntity Class confusion matrix for \"$source_type\" sources (for mapped entities):\n", " __count__ _percent_\n", " ref\\tst: "; foreach $class (@entity_classes) { printf " %3.3s ", $class; } print " "; foreach $class (@entity_classes) { printf " %3.3s", $class; } print "\n"; foreach $ref_class (@entity_classes) { printf "%5.5s ", $ref_class; foreach $class (@entity_classes) { printf "%6d", $entity_class_statistics{$ref_class}{$class}; } print " "; foreach $class (@entity_classes) { printf "%6.1f", 100*$entity_class_statistics{$ref_class}{$class} / max($entity_class_total{$ref_class},1); } print "\n"; } # name attributes foreach $type (@entity_types) { foreach my $err (@error_types) { $name_detection_statistics{$type}{$err} = 0 unless defined $name_detection_statistics {$type}{$err}; } $name_detection_statistics{total}{miss} += $name_detection_statistics{$type}{miss}; $name_detection_statistics{total}{fa} += $name_detection_statistics{$type}{fa}; $name_detection_statistics{total}{correct} += $name_detection_statistics{$type}{correct}; $name_detection_statistics{total}{error} += $name_detection_statistics{$type}{error}; } print "\nName Detection and Extent Recognition statistics for \"$source_type\" sources (for mapped entities):\n", " ref ____________count______________ ____________percent____________\n", " entity Detection Extent_Recognition Detection Extent_Recognition\n", " type miss fa miss err corr miss fa miss err corr\n"; foreach $type (@entity_types, "total") { my $total = ($name_detection_statistics{$type}{miss} + $name_detection_statistics{$type}{error} + $name_detection_statistics{$type}{correct}); printf "%5.5s%8d%6d%9d%6d%6d%11.1f%6.1f%9.1f%6.1f%6.1f\n", $type, $name_detection_statistics{$type}{miss}, $name_detection_statistics{$type}{fa}, $name_detection_statistics{$type}{miss}, $name_detection_statistics{$type}{error}, $name_detection_statistics{$type}{correct}, 100*$name_detection_statistics{$type}{miss}/max($total,1), 100*$name_detection_statistics{$type}{fa}/max($total,1), 100*$name_detection_statistics{$type}{miss}/max($total,1), 100*$name_detection_statistics{$type}{error}/max($total,1), 100*$name_detection_statistics{$type}{correct}/max($total,1); } } ################################# sub score_mention_detection { my (%men_count, $type, $men_type, $rol_type, $sty_type, $err_type); my ($pn); #scoring conditioned on mention type undef %men_count; foreach $men_type (@mention_types) { foreach $rol_type (@entity_types) { foreach $sty_type (@entity_style_types) { foreach $err_type (@error_types) { my $nent = $mention_detection_statistics{$men_type}{$rol_type}{$sty_type}{$err_type}; $men_count{$men_type}{$err_type} += $nent ? $nent : 0; $men_count {total}{$err_type} += $nent ? $nent : 0; } } } } print "\nMention Detection and Extent Recognition statistics for \"$source_type\" sources (for mapped entities):\n", " ref ____________count______________ ____________percent____________\n", " mention Detection Extent_Recognition Detection Extent_Recognition\n", " type miss fa miss err corr miss fa miss err corr\n"; foreach $type (@mention_types, "total") { $pn = 100/max($epsilon, $men_count{$type}{miss}+$men_count{$type}{error}+$men_count{$type}{correct}); printf "%5.5s%8d%6d%9d%6d%6d%11.1f%6.1f%9.1f%6.1f%6.1f\n", $type, $men_count{$type}{miss}, $men_count{$type}{fa}, $men_count{$type}{miss}, $men_count{$type}{error}, $men_count{$type}{correct}, $pn*$men_count{$type}{miss}, min(999.9,$pn*$men_count{$type}{fa}), $pn*$men_count{$type}{miss}, $pn*$men_count{$type}{error}, $pn*$men_count{$type}{correct}; } #scoring conditioned on mention style undef %men_count; foreach $men_type (@mention_types) { foreach $rol_type (@entity_types) { foreach $sty_type (@entity_style_types) { foreach $err_type (@error_types) { my $nent = $mention_detection_statistics{$men_type}{$rol_type}{$sty_type}{$err_type}; $men_count{$sty_type}{$err_type} += $nent ? $nent : 0; $men_count {total}{$err_type} += $nent ? $nent : 0; } } } } print "\nMention Detection and Extent Recognition statistics for \"$source_type\" sources (for mapped entities):\n", " ref ____________count______________ ____________percent____________\n", " mention Detection Extent_Recognition Detection Extent_Recognition\n", " style miss fa miss err corr miss fa miss err corr\n"; foreach $type (@entity_style_types, "total") { $pn = 100/max($epsilon, $men_count{$type}{miss}+$men_count{$type}{error}+$men_count{$type}{correct}); printf "%5.5s%8d%6d%9d%6d%6d%11.1f%6.1f%9.1f%6.1f%6.1f\n", $type, $men_count{$type}{miss}, $men_count{$type}{fa}, $men_count{$type}{miss}, $men_count{$type}{error}, $men_count{$type}{correct}, $pn*$men_count{$type}{miss}, min(999.9,$pn*$men_count{$type}{fa}), $pn*$men_count{$type}{miss}, $pn*$men_count{$type}{error}, $pn*$men_count{$type}{correct}; } #scoring conditioned on mention role undef %men_count; foreach $men_type (@mention_types) { foreach $rol_type (@entity_types) { foreach $sty_type (@entity_style_types) { foreach $err_type (@error_types) { my $nent = $mention_detection_statistics{$men_type}{$rol_type}{$sty_type}{$err_type}; $men_count{$rol_type}{$err_type} += $nent ? $nent : 0; $men_count {total}{$err_type} += $nent ? $nent : 0; } } } } print "\nMention Detection and Extent Recognition statistics for \"$source_type\" sources (for mapped entities):\n", " ref ____________count______________ ____________percent____________\n", " mention Detection Extent_Recognition Detection Extent_Recognition\n", " role miss fa miss err corr miss fa miss err corr\n"; foreach $type (@entity_types, "total") { $pn = 100/max($epsilon, $men_count{$type}{miss}+$men_count{$type}{error}+$men_count{$type}{correct}); printf "%5.5s%8d%6d%9d%6d%6d%11.1f%6.1f%9.1f%6.1f%6.1f\n", $type, $men_count{$type}{miss}, $men_count{$type}{fa}, $men_count{$type}{miss}, $men_count{$type}{error}, $men_count{$type}{correct}, $pn*$men_count{$type}{miss}, min(999.9,$pn*$men_count{$type}{fa}), $pn*$men_count{$type}{miss}, $pn*$men_count{$type}{error}, $pn*$men_count{$type}{correct}; } } ################################# sub score_mention_attribute_recognition { # role attributes my ($role, $ref_role, $ent_type); my (%mention_role_count, %mention_role_total); print "\nMention Role confusion matrix for \"$source_type\" sources (for mapped mentions)\n", " For all mapped mentions:\n", " ___________count___________ __________percent__________\n", " ref\\tst: "; foreach $role (@entity_types) { printf " %3.3s ", $role; } print " "; foreach $role (@entity_types) { printf " %3.3s", $role; } print "\n"; foreach $ref_role (@entity_types) { foreach $role (@entity_types) { foreach $ent_type (@entity_types) { $mention_role_statistics{$ent_type}{$ref_role}{$role} = 0 unless defined $mention_role_statistics{$ent_type}{$ref_role}{$role}; $mention_role_count{$ref_role}{$role} += $mention_role_statistics{$ent_type}{$ref_role}{$role}; } $mention_role_total{$ref_role} += $mention_role_count{$ref_role}{$role}; } printf "%5.5s ", $ref_role; foreach $role (@entity_types) { printf "%6d", $mention_role_count{$ref_role}{$role}; } print " "; foreach $role (@entity_types) { printf "%6.1f", 100*$mention_role_count{$ref_role}{$role} / max($mention_role_total{$ref_role},1); } print "\n"; } foreach $ent_type (@entity_types) { print " For mapped mentions whose entity is of type \"$ent_type\":\n", " ___________count___________ __________percent__________\n", " ref\\tst: "; foreach $role (@entity_types) { printf " %3.3s ", $role; } print " "; foreach $role (@entity_types) { printf " %3.3s", $role; } print "\n"; foreach $ref_role (@entity_types) { $mention_role_total{$ref_role} = 0; foreach $role (@entity_types) { $mention_role_total{$ref_role} += $mention_role_statistics{$ent_type}{$ref_role}{$role}; } printf "%5.5s ", $ref_role; foreach $role (@entity_types) { printf "%6d", $mention_role_statistics{$ent_type}{$ref_role}{$role}; } print " "; foreach $role (@entity_types) { printf "%6.1f", 100*$mention_role_statistics{$ent_type}{$ref_role}{$role} / max($mention_role_total{$ref_role},1); } print "\n"; } } # style attributes my ($style, $ref_style); my (%mention_style_total); foreach $ref_style (@entity_style_types) { foreach $style (@entity_style_types) { $mention_style_statistics{$ref_style}{$style} = 0 unless defined $mention_style_statistics{$ref_style}{$style}; $mention_style_total{$ref_style} += $mention_style_statistics{$ref_style}{$style}; } } print "\nMention Style confusion matrix for \"$source_type\" sources (for mapped mentions):\n", " __count__ _percent_\n", " ref\\tst: "; foreach $style (@entity_style_types) { printf " %3.3s ", $style; } print " "; foreach $style (@entity_style_types) { printf " %3.3s", $style; } print "\n"; foreach $ref_style (@entity_style_types) { printf "%5.5s ", $ref_style; foreach $style (@entity_style_types) { printf "%6d", $mention_style_statistics{$ref_style}{$style}; } print " "; foreach $style (@entity_style_types) { printf "%6.1f", 100*$mention_style_statistics{$ref_style}{$style} / max($mention_style_total{$ref_style},1); } print "\n"; } } ################################# sub score_relation_attribute_recognition { # type attributes my ($type, $ref_type); my (%relation_type_total); foreach $ref_type (@relation_types) { foreach $type (@relation_types) { $relation_type_statistics{$ref_type}{$type} = 0 unless defined $relation_type_statistics{$ref_type}{$type}; $relation_type_total{$ref_type} += $relation_type_statistics{$ref_type}{$type}; } } print "\nRelation Type confusion matrix for \"$source_type\" sources (for mapped relations):\n" ." COUNT", "." x (6*(@relation_types-1)), " PERCENT", "." x (6*(@relation_types-1)), "\n ref\\tst:"; foreach $type (@relation_types) { printf " %5.5s", $type; } print " "; foreach $type (@relation_types) { printf " %5.5s", $type; } print "\n"; foreach $ref_type (@relation_types) { printf "%11.11s ", $ref_type; foreach $type (@relation_types) { printf "%6d", $relation_type_statistics{$ref_type}{$type}; } print " "; foreach $type (@relation_types) { printf "%6.1f", 100*$relation_type_statistics{$ref_type}{$type} / max($relation_type_total{$ref_type},1); } print "\n"; } # subtype attributes my ($subtype); foreach $ref_type (@relation_types) { my (@relation_subtypes, %relation_subtype_total); @relation_subtypes = sort keys %{$relation_subtypes{$ref_type}}; foreach $subtype (@relation_subtypes) { foreach $type (@relation_subtypes) { $relation_subtype_statistics{$ref_type}{$subtype}{$type} = 0 unless defined $relation_subtype_statistics{$ref_type}{$subtype}{$type}; $relation_subtype_total{$subtype} += $relation_subtype_statistics{$ref_type}{$subtype}{$type}; } } printf "\nRelation Subtype confusion matrix for \"$source_type\" sources (for mapped relations):\n" ." type=%-5.5s", $ref_type; print " COUNT", "." x (6*(@relation_subtypes-1)), " PERCENT", "." x (6*(@relation_subtypes-1)), "\n ref\\tst: "; foreach $subtype (@relation_subtypes) { printf "%5.5s ", $subtype; } print " "; foreach $subtype (@relation_subtypes) { printf "%5.5s ", $subtype; } print "\n"; foreach $subtype (@relation_subtypes) { printf "%11.11s ", $subtype; foreach $type (@relation_subtypes) { printf "%6d", $relation_subtype_statistics{$ref_type}{$subtype}{$type}; } print " "; foreach $type (@relation_subtypes) { printf "%6.1f", 100*$relation_subtype_statistics{$ref_type}{$subtype}{$type} / max($relation_subtype_total{$subtype},1); } print "\n"; } } # class attributes my ($class, $ref_class); my (%relation_class_total); foreach $ref_class (@relation_class_types) { foreach $class (@relation_class_types) { $relation_class_statistics{$ref_class}{$class} = 0 unless defined $relation_class_statistics{$ref_class}{$class}; $relation_class_total{$ref_class} += $relation_class_statistics{$ref_class}{$class}; } } print "\nRelation Class confusion matrix for \"$source_type\" sources (for mapped relations):\n" ." COUNT", "." x (6*(@relation_class_types-1)), " PERCENT", "." x (6*(@relation_class_types-1)), "\n ref\\tst:"; foreach $class (@relation_class_types) { printf " %5.5s", $class; } print " "; foreach $class (@relation_class_types) { printf " %5.5s", $class; } print "\n"; foreach $ref_class (@relation_class_types) { printf "%11.11s ", $ref_class; foreach $class (@relation_class_types) { printf "%6d", $relation_class_statistics{$ref_class}{$class}; } print " "; foreach $class (@relation_class_types) { printf "%6.1f", 100*$relation_class_statistics{$ref_class}{$class} / max($relation_class_total{$ref_class},1); } print "\n"; } } ################################# sub score_event_attribute_recognition { # type attributes my ($type, $ref_type); my (%event_type_total); foreach $ref_type (@event_types) { foreach $type (@event_types) { $event_type_statistics{$ref_type}{$type} = 0 unless defined $event_type_statistics{$ref_type}{$type}; $event_type_total{$ref_type} += $event_type_statistics{$ref_type}{$type}; } } print "\nEvent Type confusion matrix for \"$source_type\" sources (for mapped events):\n" ." COUNT", "." x (6*(@event_types-1)), " PERCENT", "." x (6*(@event_types-1)), "\n ref\\tst:"; foreach $type (@event_types) { printf " %5.5s", $type; } print " "; foreach $type (@event_types) { printf " %5.5s", $type; } print "\n"; foreach $ref_type (@event_types) { printf "%11.11s ", $ref_type; foreach $type (@event_types) { printf "%6d", $event_type_statistics{$ref_type}{$type}; } print " "; foreach $type (@event_types) { printf "%6.1f", 100*$event_type_statistics{$ref_type}{$type} / max($event_type_total{$ref_type},1); } print "\n"; } # modality attributes my ($modality, $ref_modality); my (%event_modality_total); foreach $ref_modality (@event_modalities) { foreach $modality (@event_modalities) { $event_modality_statistics{$ref_modality}{$modality} = 0 unless defined $event_modality_statistics{$ref_modality}{$modality}; $event_modality_total{$ref_modality} += $event_modality_statistics{$ref_modality}{$modality}; } } print "\nEvent Modality confusion matrix for \"$source_type\" sources (for mapped events):\n" ." COUNT", "." x (6*(@event_modalities-1)), " PERCENT", "." x (6*(@event_modalities-1)), "\n ref\\tst:"; foreach $modality (@event_modalities) { printf " %5.5s", $modality; } print " "; foreach $modality (@event_modalities) { printf " %5.5s", $modality; } print "\n"; foreach $ref_modality (@event_modalities) { printf "%11.11s ", $ref_modality; foreach $modality (@event_modalities) { printf "%6d", $event_modality_statistics{$ref_modality}{$modality}; } print " "; foreach $modality (@event_modalities) { printf "%6.1f", 100*$event_modality_statistics{$ref_modality}{$modality} / max($event_modality_total{$ref_modality},1); } print "\n"; } # participant role attributes my ($role, $ref_role); my (%participant_role_total); my @participant_roles = (@event_participant_roles, ""); foreach $ref_role (@participant_roles) { foreach $role (@participant_roles) { $participant_role_statistics{$ref_role}{$role} = 0 unless defined $participant_role_statistics{$ref_role}{$role}; $participant_role_total{$ref_role} += $participant_role_statistics{$ref_role}{$role}; } } print "\nEvent Role confusion matrix for \"$source_type\" sources (for mapped events):\n" ." COUNT", "." x (6*(@participant_roles-1)), " PERCENT", "." x (6*(@participant_roles-1)), "\n ref\\tst:"; foreach $role (@participant_roles) { printf " %5.5s", $role; } print " "; foreach $role (@participant_roles) { printf " %5.5s", $role; } print "\n"; foreach $ref_role (@participant_roles) { printf "%11.11s ", $ref_role; foreach $role (@participant_roles) { printf "%6d", $participant_role_statistics{$ref_role}{$role}; } print " "; foreach $role (@participant_roles) { printf "%6.1f", 100*$participant_role_statistics{$ref_role}{$role} / max($participant_role_total{$ref_role},1); } print "\n"; } } ################################# sub entity_value_type { my ($value) = @_; my ($value_type); foreach my $type (@entity_value_types) { my $upper_value = $value_type = $type; $upper_value =~ s/.*[^0-9\.]+//; return $value_type if $value < $upper_value; } return $value_type; } ################################# sub entity_count_type { my ($entity) = @_; my ($count, $count_type); #determine the number of mentions of this entity in the document $count = @{$entity->{mentions}}; foreach my $type (@entity_count_types) { my $upper_count = $count_type = $type; $upper_count =~ s/.*[^0-9\.]+//; return $count_type if $count <= $upper_count; } return $count_type; } ################################# sub relation_count_type { my ($relation) = @_; my ($count, $count_type); #determine the number of mentions of this relation in the document $count = @{$relation->{mentions}}; foreach my $type (@relation_count_types) { my $upper_count = $count_type = $type; $upper_count =~ s/.*[^0-9\.]+//; return $count_type if $count <= $upper_count; } return $count_type; } ################################# sub event_count_type { my ($event) = @_; my ($count, $count_type); #determine the number of mentions of this relation in the document $count = @{$event->{mentions}}; foreach my $type (@event_count_types) { my $upper_count = $count_type = $type; $upper_count =~ s/.*[^0-9\.]+//; return $count_type if $count <= $upper_count; } return $count_type; } ################################# sub crossdoc_type { my ($entity) = @_; my ($count, $count_type); #determine the number of documents in which this entity is mentioned $count = scalar keys %{$entity->{documents}}; foreach my $type (@xdoc_types) { my $upper_count = $count_type = $type; $upper_count =~ s/.*[^0-9\.]+//; return $count_type if $count <= $upper_count; } return $count_type; } ################################# sub conditional_EDT_statistics { my ($cond1, $cond2) = @_; my ($ref_entity, $tst_entity, $ref_occ, $tst_occ, $ref_id, $tst_id, $doc); my ($ref_mentions, $tst_mentions, $ref_mention, $tst_mention, $mention_type); my ($ref_names, $tst_names, $ref_name, $tst_name); my ($entity_type, $entity_subtype, $entity_value, $count_type, $entity_class); my ($src_type, $entity_origin, $xdoc_type); my ($role_type, $style_type); my ($norm_cost, $cost, $err_type); my (%error_count, %cumulative_cost, %normalizing_cost); my ($c1_value, $c2_value, $value); #initialize output statistics undef %entity_type_statistics; undef %entity_subtype_statistics; undef %entity_class_statistics; undef %name_detection_statistics; undef %mention_role_statistics; undef %mention_style_statistics; undef %mention_detection_statistics; my @c1_types = $cond1 eq "ENTITY TYPE" ? @entity_types : $cond1 eq "ENTITY VALUE" ? @entity_value_types : $cond1 eq "COUNT TYPE" ? @entity_count_types : $cond1 eq "SOURCE TYPE" ? keys %source_types : $cond1 eq "ENTITY ORIGIN" ? @origins : $cond1 eq "ENTITY CLASS" ? @entity_classes : $cond1 eq "CROSS-DOC TYPE" ? @xdoc_types : undef; die "\n\nFATAL ERROR in conditional_EDT_statistics: '$cond1' is not a recognized condition\n\n" unless @c1_types; if ($cond2) { my @c2_types = $cond2 eq "ENTITY TYPE" ? @entity_types : $cond2 eq "ENTITY VALUE" ? @entity_value_types : $cond2 eq "COUNT TYPE" ? @entity_count_types : $cond2 eq "SOURCE TYPE" ? keys %source_types : $cond2 eq "ENTITY ORIGIN" ? @origins : $cond2 eq "ENTITY CLASS" ? @entity_classes : $cond2 eq "CROSS-DOC TYPE" ? @xdoc_types : undef; die "\n\nFATAL ERROR in conditional_EDT_statistics: '$cond2' is not a recognized condition\n\n" unless @c2_types; foreach $c2_value (@c2_types, "ALL") { foreach $c1_value (@c1_types, "total") { $normalizing_cost{$c2_value}{$c1_value} = 0; foreach $err_type (@error_types) { $error_count{$c2_value}{$c1_value}{$err_type} = 0; $cumulative_cost{$c2_value}{$c1_value}{$err_type} = 0; } } } } else { foreach $c1_value (@c1_types, "total") { $normalizing_cost{$c1_value} = 0; foreach $err_type (@error_types) { $error_count{$c1_value}{$err_type} = 0; $cumulative_cost{$c1_value}{$err_type} = 0; } } } #accumulate recognition statistics over all entity occurrences in all documents foreach $ref_id (keys %{$ref_database{entities}}) { $ref_entity = $ref_database{entities}{$ref_id}; $entity_type = $ref_entity->{TYPE}; $entity_subtype = $ref_entity->{SUBTYPE}; $entity_class = $ref_entity->{CLASS}; $entity_origin = $ref_entity->{ORIGIN}; $xdoc_type = crossdoc_type ($ref_entity); foreach $doc (keys %{$ref_entity->{documents}}) { next unless exists $eval_docs{$doc}; $ref_occ = $ref_entity->{documents}{$doc}; $src_type = $ref_occ->{SOURCE}; $entity_value = entity_value_type($ref_occ->{VALUE}); $count_type = entity_count_type ($ref_occ); if ($tst_occ = $ref_occ->{MAP}) { #update entity attribute recognition statistics $entity_type_statistics{$entity_type}{$tst_occ->{TYPE}}++; $entity_subtype_statistics{$entity_subtype}{$tst_occ->{TYPE}}++; $entity_class_statistics{$entity_class}{$tst_occ->{CLASS}}++; #update entity name statistics $ref_names = $ref_occ->{names}; foreach $ref_name (@$ref_names) { $err_type = "miss"; $tst_name = $ref_name->{MAP}; $err_type = $tst_name ? (extent_mismatch($ref_name->{locator}, $tst_name->{locator}) <= 1 ? "correct" : "error") : "miss"; $name_detection_statistics{$entity_type}{$err_type}++; } $tst_names = $tst_occ->{names}; foreach $tst_name (@$tst_names) { $name_detection_statistics{$entity_type}{fa}++ unless $tst_name->{MAP}; } #update entity mention statistics $ref_mentions = $ref_occ->{mentions}; $tst_mentions = $tst_occ->{mentions}; foreach $ref_mention (@$ref_mentions) { $mention_type = $ref_mention->{TYPE}; $role_type = $ref_mention->{ROLE}; $style_type = $ref_mention->{STYLE}; $err_type = "miss"; if ($ref_mention->{MAP}) { $mention_role_statistics{$entity_type}{$role_type}{$ref_mention->{MAP}{ROLE}}++; $mention_style_statistics{$style_type}{$ref_mention->{MAP}{STYLE}}++; $err_type = extent_mismatch ($ref_mention->{extent}{locator}, $ref_mention->{MAP}->{extent}{locator}) <= 1 ? "correct" : "error"; } $mention_detection_statistics{$mention_type}{$role_type}{$style_type}{$err_type}++; } foreach $tst_mention (@$tst_mentions) { next if $tst_mention->{MAP}; $mention_type = $tst_mention->{TYPE}; $role_type = $tst_mention->{ROLE}; $style_type = $tst_mention->{STYLE}; $mention_detection_statistics{$mention_type}{$role_type}{$style_type}{fa}++; } } #update entity detection statistics $cost = $norm_cost = $ref_occ->{VALUE}; $cost -= $mapped_entity_document_value{$doc}{$ref_occ->{ID}}{$tst_occ->{ID}} if $tst_occ; $err_type = $tst_occ ? (($entity_type eq $tst_occ->{TYPE} and $entity_subtype eq $tst_occ->{SUBTYPE}) ? "correct" : "error") : "miss"; $c1_value = $cond1 eq "ENTITY TYPE" ? $entity_type : $cond1 eq "ENTITY VALUE" ? $entity_value : $cond1 eq "COUNT TYPE" ? $count_type : $cond1 eq "SOURCE TYPE" ? $src_type : $cond1 eq "ENTITY ORIGIN" ? $entity_origin : $cond1 eq "ENTITY CLASS" ? $entity_class : $cond1 eq "CROSS-DOC TYPE" ? $xdoc_type : undef; if ($cond2) { $c2_value = $cond2 eq "ENTITY TYPE" ? $entity_type : $cond2 eq "ENTITY VALUE" ? $entity_value : $cond2 eq "COUNT TYPE" ? $count_type : $cond2 eq "SOURCE TYPE" ? $src_type : $cond2 eq "ENTITY ORIGIN" ? $entity_origin : $cond2 eq "ENTITY CLASS" ? $entity_class : $cond2 eq "CROSS-DOC TYPE" ? $xdoc_type : undef; $error_count{$c2_value}{$c1_value}{$err_type}++; $error_count{$c2_value} {total} {$err_type}++; $error_count {ALL} {$c1_value}{$err_type}++; $error_count {ALL} {total} {$err_type}++; $cumulative_cost{$c2_value}{$c1_value}{$err_type} += $cost; $cumulative_cost{$c2_value} {total} {$err_type} += $cost; $cumulative_cost {ALL} {$c1_value}{$err_type} += $cost; $cumulative_cost {ALL} {total} {$err_type} += $cost; $normalizing_cost{$c2_value}{$c1_value} += $norm_cost; $normalizing_cost{$c2_value} {total} += $norm_cost; $normalizing_cost {ALL} {$c1_value} += $norm_cost; $normalizing_cost {ALL} {total} += $norm_cost; } else { $error_count{$c1_value}{$err_type}++; $error_count {total} {$err_type}++; $cumulative_cost{$c1_value}{$err_type} += $cost; $cumulative_cost {total} {$err_type} += $cost; $normalizing_cost{$c1_value} += $norm_cost; $normalizing_cost {total} += $norm_cost; } } } #update entity false alarm statistics foreach $tst_id (keys %{$tst_database{entities}}) { $tst_entity = $tst_database{entities}{$tst_id}; $entity_type = $tst_entity->{TYPE}; $entity_class = $tst_entity->{CLASS}; $entity_origin = $tst_entity->{ORIGIN}; $xdoc_type = crossdoc_type ($tst_entity); foreach $doc (keys %{$tst_entity->{documents}}) { next unless exists $eval_docs{$doc}; $tst_occ = $tst_entity->{documents}{$doc}; next if $tst_occ->{MAP}; $src_type = $tst_occ->{SOURCE}; $entity_value = entity_value_type($tst_occ->{VALUE}); $count_type = entity_count_type ($tst_occ); $norm_cost = 0; $cost = -$tst_occ->{FA_VALUE}; $err_type = "fa"; $c1_value = $cond1 eq "ENTITY TYPE" ? $entity_type : $cond1 eq "ENTITY VALUE" ? $entity_value : $cond1 eq "COUNT TYPE" ? $count_type : $cond1 eq "SOURCE TYPE" ? $src_type : $cond1 eq "ENTITY ORIGIN" ? $entity_origin : $cond1 eq "ENTITY CLASS" ? $entity_class : $cond1 eq "CROSS-DOC TYPE" ? $xdoc_type : undef; if ($cond2) { $c2_value = $cond2 eq "ENTITY TYPE" ? $entity_type : $cond2 eq "ENTITY VALUE" ? $entity_value : $cond2 eq "COUNT TYPE" ? $count_type : $cond2 eq "SOURCE TYPE" ? $src_type : $cond2 eq "ENTITY ORIGIN" ? $entity_origin : $cond2 eq "ENTITY CLASS" ? $entity_class : $cond2 eq "CROSS-DOC TYPE" ? $xdoc_type : undef; $error_count{$c2_value}{$c1_value}{$err_type}++; $error_count{$c2_value} {total} {$err_type}++; $error_count {ALL} {$c1_value}{$err_type}++; $error_count {ALL} {total} {$err_type}++; $cumulative_cost{$c2_value}{$c1_value}{$err_type} += $cost; $cumulative_cost{$c2_value} {total} {$err_type} += $cost; $cumulative_cost {ALL} {$c1_value}{$err_type} += $cost; $cumulative_cost {ALL} {total} {$err_type} += $cost; $normalizing_cost{$c2_value}{$c1_value} += $norm_cost; $normalizing_cost{$c2_value} {total} += $norm_cost; $normalizing_cost {ALL} {$c1_value} += $norm_cost; $normalizing_cost {ALL} {total} += $norm_cost; } else { $error_count{$c1_value}{$err_type}++; $error_count {total} {$err_type}++; $cumulative_cost{$c1_value}{$err_type} += $cost; $cumulative_cost {total} {$err_type} += $cost; $normalizing_cost{$c1_value} += $norm_cost; $normalizing_cost {total} += $norm_cost; } } } return ({%error_count}, {%cumulative_cost}, {%normalizing_cost}); } ################################# sub conditional_entity_link_statistics { my ($cond1, $cond2) = @_; my ($ref_entity, $tst_entity, $ref_occ, $tst_occ, $ref_id, $tst_id, $doc); my ($entity_type, $entity_value, $count_type, $entity_class); my ($src_type, $entity_origin, $xdoc_type); my ($norm_cost, $cost, $err_type); my (%error_count, %cumulative_cost, %normalizing_cost); my ($c1_value, $c2_value, $value); #initialize output statistics my @c1_types = $cond1 eq "ENTITY TYPE" ? @entity_types : $cond1 eq "ENTITY VALUE" ? @entity_value_types : $cond1 eq "COUNT TYPE" ? @entity_count_types : $cond1 eq "SOURCE TYPE" ? keys %source_types : $cond1 eq "ENTITY ORIGIN" ? @origins : $cond1 eq "ENTITY CLASS" ? @entity_classes : $cond1 eq "CROSS-DOC TYPE" ? @xdoc_types : undef; die "\n\nFATAL ERROR in conditional_entity_link_statistics: '$cond1' is not a recognized condition\n\n" unless @c1_types; if ($cond2) { my @c2_types = $cond2 eq "ENTITY TYPE" ? @entity_types : $cond2 eq "ENTITY VALUE" ? @entity_value_types : $cond2 eq "COUNT TYPE" ? @entity_count_types : $cond2 eq "SOURCE TYPE" ? keys %source_types : $cond2 eq "ENTITY ORIGIN" ? @origins : $cond2 eq "ENTITY CLASS" ? @entity_classes : $cond2 eq "CROSS-DOC TYPE" ? @xdoc_types : undef; die "\n\nFATAL ERROR in conditional_entity_link_statistics: '$cond2' is not a recognized condition\n\n" unless @c2_types; foreach $c2_value (@c2_types, "ALL") { foreach $c1_value (@c1_types, "total") { $normalizing_cost{$c2_value}{$c1_value} = 0; foreach $err_type (@error_types) { $error_count{$c2_value}{$c1_value}{$err_type} = 0; $cumulative_cost{$c2_value}{$c1_value}{$err_type} = 0; } } } } else { foreach $c1_value (@c1_types, "total") { $normalizing_cost{$c1_value} = 0; foreach $err_type (@error_types) { $error_count{$c1_value}{$err_type} = 0; $cumulative_cost{$c1_value}{$err_type} = 0; } } } #accumulate recognition statistics over all entity occurrences in all documents foreach $ref_id (keys %{$ref_database{entities}}) { $ref_entity = $ref_database{entities}{$ref_id}; $entity_type = $ref_entity->{TYPE}; $entity_class = $ref_entity->{CLASS}; $entity_origin = $ref_entity->{ORIGIN}; $tst_entity = $ref_entity->{MAP}; $xdoc_type = crossdoc_type ($ref_entity); foreach $doc (keys %{$ref_entity->{documents}}) { next unless exists $eval_docs{$doc}; $ref_occ = $ref_entity->{documents}{$doc}; $src_type = $ref_occ->{SOURCE}; $entity_value = entity_value_type($ref_occ->{VALUE}); $count_type = entity_count_type ($ref_occ); #update entity detection statistics $tst_occ = $ref_occ->{MAP}; $cost = $norm_cost = $ref_occ->{VALUE}; $cost -= $mapped_entity_document_value{$doc}{$ref_occ->{ID}}{$tst_occ->{ID}} if $tst_occ; $err_type = $tst_occ ? ((($ref_entity->{ORIGIN} eq "DATABASE" and $tst_entity->{ID} eq $ref_id) or ($ref_entity->{ORIGIN} eq "CORPUS" and $tst_entity->{ORIGIN} eq "CORPUS")) ? "correct" : "error") : "miss"; $c1_value = $cond1 eq "ENTITY TYPE" ? $entity_type : $cond1 eq "ENTITY VALUE" ? $entity_value : $cond1 eq "COUNT TYPE" ? $count_type : $cond1 eq "SOURCE TYPE" ? $src_type : $cond1 eq "ENTITY ORIGIN" ? $entity_origin : $cond1 eq "ENTITY CLASS" ? $entity_class : $cond1 eq "CROSS-DOC TYPE" ? $xdoc_type : undef; if ($cond2) { $c2_value = $cond2 eq "ENTITY TYPE" ? $entity_type : $cond2 eq "ENTITY VALUE" ? $entity_value : $cond2 eq "COUNT TYPE" ? $count_type : $cond2 eq "SOURCE TYPE" ? $src_type : $cond2 eq "ENTITY ORIGIN" ? $entity_origin : $cond2 eq "ENTITY CLASS" ? $entity_class : $cond2 eq "CROSS-DOC TYPE" ? $xdoc_type : undef; $error_count{$c2_value}{$c1_value}{$err_type}++; $error_count{$c2_value} {total} {$err_type}++; $error_count {ALL} {$c1_value}{$err_type}++; $error_count {ALL} {total} {$err_type}++; $cumulative_cost{$c2_value}{$c1_value}{$err_type} += $cost; $cumulative_cost{$c2_value} {total} {$err_type} += $cost; $cumulative_cost {ALL} {$c1_value}{$err_type} += $cost; $cumulative_cost {ALL} {total} {$err_type} += $cost; $normalizing_cost{$c2_value}{$c1_value} += $norm_cost; $normalizing_cost{$c2_value} {total} += $norm_cost; $normalizing_cost {ALL} {$c1_value} += $norm_cost; $normalizing_cost {ALL} {total} += $norm_cost; } else { $error_count{$c1_value}{$err_type}++; $error_count {total} {$err_type}++; $cumulative_cost{$c1_value}{$err_type} += $cost; $cumulative_cost {total} {$err_type} += $cost; $normalizing_cost{$c1_value} += $norm_cost; $normalizing_cost {total} += $norm_cost; } } } #update entity false alarm statistics foreach $tst_id (keys %{$tst_database{entities}}) { $tst_entity = $tst_database{entities}{$tst_id}; $entity_type = $tst_entity->{TYPE}; $entity_class = $tst_entity->{CLASS}; $entity_origin = $tst_entity->{ORIGIN}; $xdoc_type = crossdoc_type ($tst_entity); foreach $doc (keys %{$tst_entity->{documents}}) { next unless exists $eval_docs{$doc}; $tst_occ = $tst_entity->{documents}{$doc}; $src_type = $tst_occ->{SOURCE}; next if $tst_occ->{MAP}; $entity_value = entity_value_type($tst_occ->{VALUE}); $count_type = entity_count_type ($tst_occ); $err_type = "fa"; $cost = -$tst_occ->{FA_VALUE}; $norm_cost = 0; $c1_value = $cond1 eq "ENTITY TYPE" ? $entity_type : $cond1 eq "ENTITY VALUE" ? $entity_value : $cond1 eq "COUNT TYPE" ? $count_type : $cond1 eq "SOURCE TYPE" ? $src_type : $cond1 eq "ENTITY ORIGIN" ? $entity_origin : $cond1 eq "ENTITY CLASS" ? $entity_class : $cond1 eq "CROSS-DOC TYPE" ? $xdoc_type : undef; if ($cond2) { $c2_value = $cond2 eq "ENTITY TYPE" ? $entity_type : $cond2 eq "ENTITY VALUE" ? $entity_value : $cond2 eq "COUNT TYPE" ? $count_type : $cond2 eq "SOURCE TYPE" ? $src_type : $cond2 eq "ENTITY ORIGIN" ? $entity_origin : $cond2 eq "ENTITY CLASS" ? $entity_class : $cond2 eq "CROSS-DOC TYPE" ? $xdoc_type : undef; $error_count{$c2_value}{$c1_value}{$err_type}++; $error_count{$c2_value} {total} {$err_type}++; $error_count {ALL} {$c1_value}{$err_type}++; $error_count {ALL} {total} {$err_type}++; $cumulative_cost{$c2_value}{$c1_value}{$err_type} += $cost; $cumulative_cost{$c2_value} {total} {$err_type} += $cost; $cumulative_cost {ALL} {$c1_value}{$err_type} += $cost; $cumulative_cost {ALL} {total} {$err_type} += $cost; $normalizing_cost{$c2_value}{$c1_value} += $norm_cost; $normalizing_cost{$c2_value} {total} += $norm_cost; $normalizing_cost {ALL} {$c1_value} += $norm_cost; $normalizing_cost {ALL} {total} += $norm_cost; } else { $error_count{$c1_value}{$err_type}++; $error_count {total} {$err_type}++; $cumulative_cost{$c1_value}{$err_type} += $cost; $cumulative_cost {total} {$err_type} += $cost; $normalizing_cost{$c1_value} += $norm_cost; $normalizing_cost {total} += $norm_cost; } } } return ({%error_count}, {%cumulative_cost}, {%normalizing_cost}); } ################################# sub conditional_RDC_statistics { my ($cond1, $cond2) = @_; my ($ref_relation, $tst_relation, $ref_occ, $tst_occ, $ref_id, $tst_id, $doc); my ($ref_mentions, $tst_mentions, $ref_mention, $tst_mention); my ($relation_type, $relation_subtype, $count_type, $relation_class, $relation_origin); my ($src_type, $xdoc_type, $type_err, $subt_err, $time_err); my ($narg_errs, $arg_errs); my ($norm_value, $cost, $err_type); my (%error_count, %cumulative_cost, %normalizing_cost); my ($c1_value, $c2_value); #initialize output statistics undef %relation_type_statistics; undef %relation_subtype_statistics; undef %relation_class_statistics; my @c1_types = $cond1 eq "RELATION TYPE" ? @relation_types : $cond1 eq "RELATION CLASS" ? @relation_class_types : $cond1 eq "COUNT TYPE" ? @relation_count_types : $cond1 eq "SOURCE TYPE" ? keys %source_types : $cond1 eq "ARGUMENT ERRORS" ? @relation_argument_errors : undef; die "\n\nFATAL ERROR in conditional_RDC_statistics: '$cond1' is not a recognized condition\n\n" unless @c1_types; if ($cond2) { my @c2_types = $cond2 eq "RELATION TYPE" ? @relation_types : $cond2 eq "RELATION CLASS" ? @relation_class_types : $cond2 eq "COUNT TYPE" ? @relation_count_types : $cond2 eq "SOURCE TYPE" ? keys %source_types : $cond2 eq "ARGUMENT ERRORS" ? @relation_argument_errors : undef; die "\n\nFATAL ERROR in conditional_RDC_statistics: '$cond2' is not a recognized condition\n\n" unless @c2_types; foreach $c2_value (@c2_types, "ALL") { foreach $c1_value (@c1_types, "total") { $normalizing_cost{$c2_value}{$c1_value} = 0; foreach $err_type (@error_types) { $error_count{$c2_value}{$c1_value}{$err_type} = 0; $cumulative_cost{$c2_value}{$c1_value}{$err_type} = 0; } } } } else { foreach $c1_value (@c1_types, "total") { $normalizing_cost{$c1_value} = 0; foreach $err_type (@error_types) { $error_count{$c1_value}{$err_type} = 0; $cumulative_cost{$c1_value}{$err_type} = 0; } } } #accumulate recognition statistics over all relation occurrences in all documents foreach $ref_id (keys %{$ref_database{relations}}) { $ref_relation = $ref_database{relations}{$ref_id}; $relation_type = $ref_relation->{TYPE}; $relation_subtype = $ref_relation->{SUBTYPE}; $relation_class = $ref_relation->{CLASS}; $relation_origin = $ref_relation->{ORIGIN}; $xdoc_type = crossdoc_type ($ref_relation); foreach $doc (keys %{$ref_relation->{documents}}) { next unless exists $eval_docs{$doc}; $ref_occ = $ref_relation->{documents}{$doc}; $src_type = $ref_occ->{SOURCE}; $count_type = relation_count_type ($ref_occ); if ($tst_occ=$ref_occ->{MAP}) { #update relation attribute recognition statistics $relation_type_statistics{$relation_type}{$tst_occ->{TYPE}}++; $relation_subtype_statistics{$relation_type}{$relation_subtype}{$tst_occ->{SUBTYPE}}++; $relation_class_statistics{$relation_class}{$tst_occ->{CLASS}}++; #update relation detection statistics $cost = $norm_value = $ref_occ->{VALUE}; $cost -= $mapped_relation_document_value{$doc}{$ref_occ->{ID}}{$tst_occ->{ID}}; $type_err = $relation_type ne $tst_occ->{TYPE}; $subt_err = $relation_subtype ne $tst_occ->{SUBTYPE}; $time_err = (num_relation_time_mismatches ($ref_relation->{times}, $ref_relation->{MAP}{times}) == 0) ? "correct" : "error"; $narg_errs = num_relation_argument_mapping_errors ($ref_relation, $ref_relation->{MAP}); $arg_errs = $narg_errs > 1 ? ">1" : $narg_errs; $err_type = ($arg_errs or $type_err eq "error" or $subt_err eq "error") ? "error" : "correct"; } else { $arg_errs = 0; $err_type = "miss"; $cost = $norm_value = $ref_occ->{VALUE}; } $c1_value = $cond1 eq "RELATION TYPE" ? $relation_type : $cond1 eq "RELATION CLASS" ? $relation_class : $cond1 eq "COUNT TYPE" ? $count_type : $cond1 eq "SOURCE TYPE" ? $src_type : $cond1 eq "ARGUMENT ERRORS" ? $arg_errs : undef; if ($cond2) { $c2_value = $cond2 eq "RELATION TYPE" ? $relation_type : $cond2 eq "RELATION CLASS" ? $relation_class : $cond2 eq "COUNT TYPE" ? $count_type : $cond2 eq "SOURCE TYPE" ? $src_type : $cond2 eq "ARGUMENT ERRORS" ? $arg_errs : undef; $error_count{$c2_value}{$c1_value}{$err_type}++; $error_count{$c2_value} {total} {$err_type}++; $error_count {ALL} {$c1_value}{$err_type}++; $error_count {ALL} {total} {$err_type}++; $cumulative_cost{$c2_value}{$c1_value}{$err_type} += $cost; $cumulative_cost{$c2_value} {total} {$err_type} += $cost; $cumulative_cost {ALL} {$c1_value}{$err_type} += $cost; $cumulative_cost {ALL} {total} {$err_type} += $cost; $normalizing_cost{$c2_value}{$c1_value} += $norm_value; $normalizing_cost{$c2_value} {total} += $norm_value; $normalizing_cost {ALL} {$c1_value} += $norm_value; $normalizing_cost {ALL} {total} += $norm_value; } else { $error_count{$c1_value}{$err_type}++; $error_count {total} {$err_type}++; $cumulative_cost{$c1_value}{$err_type} += $cost; $cumulative_cost {total} {$err_type} += $cost; $normalizing_cost{$c1_value} += $norm_value; $normalizing_cost {total} += $norm_value; } } } #update relation false alarm statistics foreach $tst_id (keys %{$tst_database{relations}}) { $tst_relation = $tst_database{relations}{$tst_id}; $relation_type = $tst_relation->{TYPE}; $relation_subtype = $tst_relation->{SUBTYPE}; $relation_class = $tst_relation->{CLASS}; $relation_origin = $tst_relation->{ORIGIN}; $xdoc_type = crossdoc_type ($tst_relation); foreach $doc (keys %{$tst_relation->{documents}}) { next unless exists $eval_docs{$doc}; $tst_occ = $tst_relation->{documents}{$doc}; next if $tst_occ->{MAP}; $src_type = $tst_occ->{SOURCE}; $count_type = relation_count_type ($tst_occ); $arg_errs = 0; $norm_value = 0; $cost = $relation_fa_wgt * $tst_occ->{VALUE}; $err_type = "fa"; $c1_value = $cond1 eq "RELATION TYPE" ? $relation_type : $cond1 eq "RELATION CLASS" ? $relation_class : $cond1 eq "COUNT TYPE" ? $count_type : $cond1 eq "SOURCE TYPE" ? $src_type : $cond1 eq "ARGUMENT ERRORS" ? $arg_errs : undef; if ($cond2) { $c2_value = $cond2 eq "RELATION TYPE" ? $relation_type : $cond2 eq "RELATION CLASS" ? $relation_class : $cond2 eq "COUNT TYPE" ? $count_type : $cond2 eq "SOURCE TYPE" ? $src_type : $cond2 eq "ARGUMENT ERRORS" ? $arg_errs : undef; $error_count{$c2_value}{$c1_value}{$err_type}++; $error_count{$c2_value} {total} {$err_type}++; $error_count {ALL} {$c1_value}{$err_type}++; $error_count {ALL} {total} {$err_type}++; $cumulative_cost{$c2_value}{$c1_value}{$err_type} += $cost; $cumulative_cost{$c2_value} {total} {$err_type} += $cost; $cumulative_cost {ALL} {$c1_value}{$err_type} += $cost; $cumulative_cost {ALL} {total} {$err_type} += $cost; $normalizing_cost{$c2_value}{$c1_value} += $norm_value; $normalizing_cost{$c2_value} {total} += $norm_value; $normalizing_cost {ALL} {$c1_value} += $norm_value; $normalizing_cost {ALL} {total} += $norm_value; } else { $error_count{$c1_value}{$err_type}++; $error_count {total} {$err_type}++; $cumulative_cost{$c1_value}{$err_type} += $cost; $cumulative_cost {total} {$err_type} += $cost; $normalizing_cost{$c1_value} += $norm_value; $normalizing_cost {total} += $norm_value; } } } return ({%error_count}, {%cumulative_cost}, {%normalizing_cost}); } ################################# sub num_relation_argument_mapping_errors { my ($ref, $tst) = @_; my ($score, $arg_order, $narg, $ref_arg, $tst_arg, $nmapped, $num_errors); ($score, $arg_order) = relation_document_value ($ref, $tst); $nmapped = 0; for ($narg=0; $narg<@{$ref->{arguments}}; $narg++) { $ref_arg = $ref->{arguments}[$narg]; $tst_arg = $tst->{arguments}[$arg_order->[$narg]]; $nmapped++ if ($ref_database{entities}{$ref_arg->{ID}}{MAP} and $tst_arg->{ID} and $ref_database{entities}{$ref_arg->{ID}}{MAP}{ID} eq $tst_arg->{ID}); } $num_errors = @{$ref->{arguments}} + @{$tst->{arguments}} - 2*$nmapped; return $num_errors; } ################################# sub num_event_participant_mapping_errors { my ($ref_event) = @_; my ($num_errs, $role, $id, $tst_event, $ref_entity, $ref_participant); $num_errs = num_event_participants ($ref_event); $tst_event = $ref_event->{MAP}; return $num_errs unless $tst_event; $num_errs += num_event_participants ($tst_event); foreach $role (sort keys %{$ref_event->{participants}}) { foreach $id (sort keys %{$ref_event->{participants}{$role}}) { $ref_entity = $ref_database{entities}{$id}; $ref_participant = $ref_event->{participants}{$role}{$id}; $num_errs -= 2 if ($ref_participant->{MAP} and $ref_entity->{MAP} and $ref_entity->{MAP}{ID} eq $ref_participant->{MAP}{ID}); } } return $num_errs; } ################################# sub conditional_EDC_statistics { my ($cond1, $cond2) = @_; my ($ref_event, $tst_event, $ref_occ, $tst_occ, $ref_id, $tst_id, $doc); my ($ref_mentions, $tst_mentions, $ref_mention, $tst_mention); my ($event_type, $count_type, $event_modality, $event_origin); my ($src_type, $xdoc_type, $type_err, $modality_err); my ($nerrs, $participant_errs, $ref_role, $tst_role, $tst_participant); my ($norm_value, $cost, $err_type); my (%error_count, %cumulative_cost, %normalizing_cost); my ($c1_value, $c2_value); #initialize output statistics undef %event_type_statistics; undef %event_modality_statistics; undef %participant_role_statistics; my @c1_types = $cond1 eq "EVENT TYPE" ? @event_types : $cond1 eq "EVENT MODALITY" ? @event_modalities : $cond1 eq "COUNT TYPE" ? @event_count_types : $cond1 eq "SOURCE TYPE" ? keys %source_types : $cond1 eq "PARTICIPANT ERRORS" ? @event_participant_errors : undef; die "\n\nFATAL ERROR in conditional_EDC_statistics: '$cond1' is not a recognized condition\n\n" unless @c1_types; if ($cond2) { my @c2_types = $cond2 eq "EVENT TYPE" ? @event_types : $cond2 eq "EVENT MODALITY" ? @event_modalities : $cond2 eq "COUNT TYPE" ? @event_count_types : $cond2 eq "SOURCE TYPE" ? keys %source_types : $cond2 eq "PARTICIPANT ERRORS" ? @event_participant_errors : undef; die "\n\nFATAL ERROR in conditional_EDC_statistics: '$cond2' is not a recognized condition\n\n" unless @c2_types; foreach $c2_value (@c2_types, "ALL") { foreach $c1_value (@c1_types, "total") { $normalizing_cost{$c2_value}{$c1_value} = 0; foreach $err_type (@error_types) { $error_count{$c2_value}{$c1_value}{$err_type} = 0; $cumulative_cost{$c2_value}{$c1_value}{$err_type} = 0; } } } } else { foreach $c1_value (@c1_types, "total") { $normalizing_cost{$c1_value} = 0; foreach $err_type (@error_types) { $error_count{$c1_value}{$err_type} = 0; $cumulative_cost{$c1_value}{$err_type} = 0; } } } #accumulate recognition statistics over all event occurrences in all documents foreach $ref_id (keys %{$ref_database{events}}) { $ref_event = $ref_database{events}{$ref_id}; $event_type = $ref_event->{TYPE}; $event_modality = $ref_event->{MODALITY}; $event_origin = $ref_event->{ORIGIN}; $xdoc_type = crossdoc_type ($ref_event); foreach $doc (keys %{$ref_event->{documents}}) { next unless exists $eval_docs{$doc}; $ref_occ = $ref_event->{documents}{$doc}; $src_type = $ref_occ->{SOURCE}; $count_type = event_count_type ($ref_occ); if ($tst_occ=$ref_occ->{MAP}) { #update event attribute recognition statistics $event_type_statistics{$event_type}{$ref_occ->{MAP}{TYPE}}++; $event_modality_statistics{$event_modality}{$ref_occ->{MAP}{MODALITY}}++; foreach $ref_role (keys %{$ref_occ->{participants}}) { foreach $ref_id (keys %{$ref_occ->{participants}{$ref_role}}) { $tst_participant = $ref_occ->{participants}{$ref_role}{$ref_id}->{MAP}; $tst_role = $tst_participant ? $tst_participant->{ROLE} : ""; $participant_role_statistics{$ref_role}{$tst_role}++; } } foreach $tst_role (keys %{$tst_occ->{participants}}) { foreach $tst_id (keys %{$tst_occ->{participants}{$tst_role}}) { next if $tst_occ->{participants}{$tst_role}{$tst_id}->{MAP}; $participant_role_statistics{""}{$tst_role}++; } } #update event detection statistics $cost = $norm_value = $ref_occ->{VALUE}; $cost -= $mapped_event_document_value{$doc}{$ref_occ->{ID}}{$tst_occ->{ID}}; $type_err = $event_type ne $tst_occ->{TYPE}; $modality_err = $event_modality ne $tst_occ->{MODALITY}; $nerrs = num_event_participant_mapping_errors ($ref_event, $tst_event); $participant_errs = $nerrs > 1 ? ">1" : $nerrs; $err_type = ($participant_errs or $type_err or $modality_err) ? "error" : "correct"; } else { $participant_errs = 0; $err_type = "miss"; $cost = $norm_value = $ref_occ->{VALUE}; } $c1_value = $cond1 eq "EVENT TYPE" ? $event_type : $cond1 eq "EVENT MODALITY" ? $event_modality : $cond1 eq "COUNT TYPE" ? $count_type : $cond1 eq "SOURCE TYPE" ? $src_type : $cond1 eq "PARTICIPANT ERRORS" ? $participant_errs : undef; if ($cond2) { $c2_value = $cond2 eq "EVENT TYPE" ? $event_type : $cond2 eq "EVENT MODALITY" ? $event_modality : $cond2 eq "COUNT TYPE" ? $count_type : $cond2 eq "SOURCE TYPE" ? $src_type : $cond2 eq "PARTICIPANT ERRORS" ? $participant_errs : undef; $error_count{$c2_value}{$c1_value}{$err_type}++; $error_count{$c2_value} {total} {$err_type}++; $error_count {ALL} {$c1_value}{$err_type}++; $error_count {ALL} {total} {$err_type}++; $cumulative_cost{$c2_value}{$c1_value}{$err_type} += $cost; $cumulative_cost{$c2_value} {total} {$err_type} += $cost; $cumulative_cost {ALL} {$c1_value}{$err_type} += $cost; $cumulative_cost {ALL} {total} {$err_type} += $cost; $normalizing_cost{$c2_value}{$c1_value} += $norm_value; $normalizing_cost{$c2_value} {total} += $norm_value; $normalizing_cost {ALL} {$c1_value} += $norm_value; $normalizing_cost {ALL} {total} += $norm_value; } else { $error_count{$c1_value}{$err_type}++; $error_count {total} {$err_type}++; $cumulative_cost{$c1_value}{$err_type} += $cost; $cumulative_cost {total} {$err_type} += $cost; $normalizing_cost{$c1_value} += $norm_value; $normalizing_cost {total} += $norm_value; } } } #update event false alarm statistics foreach $tst_id (keys %{$tst_database{events}}) { $tst_event = $tst_database{events}{$tst_id}; $event_type = $tst_event->{TYPE}; $event_modality = $tst_event->{MODALITY}; $event_origin = $tst_event->{ORIGIN}; $xdoc_type = crossdoc_type ($tst_event); foreach $doc (keys %{$tst_event->{documents}}) { next unless exists $eval_docs{$doc}; $tst_occ = $tst_event->{documents}{$doc}; next if $tst_occ->{MAP}; $src_type = $tst_occ->{SOURCE}; $count_type = event_count_type ($tst_occ); $participant_errs = 0; $norm_value = 0; $cost = $event_fa_wgt * $tst_occ->{VALUE}; $err_type = "fa"; $c1_value = $cond1 eq "EVENT TYPE" ? $event_type : $cond1 eq "EVENT MODALITY" ? $event_modality : $cond1 eq "COUNT TYPE" ? $count_type : $cond1 eq "SOURCE TYPE" ? $src_type : $cond1 eq "PARTICIPANT ERRORS" ? $participant_errs : undef; if ($cond2) { $c2_value = $cond2 eq "EVENT TYPE" ? $event_type : $cond2 eq "EVENT MODALITY" ? $event_modality : $cond2 eq "COUNT TYPE" ? $count_type : $cond2 eq "SOURCE TYPE" ? $src_type : $cond2 eq "PARTICIPANT ERRORS" ? $participant_errs : undef; $error_count{$c2_value}{$c1_value}{$err_type}++; $error_count{$c2_value} {total} {$err_type}++; $error_count {ALL} {$c1_value}{$err_type}++; $error_count {ALL} {total} {$err_type}++; $cumulative_cost{$c2_value}{$c1_value}{$err_type} += $cost; $cumulative_cost{$c2_value} {total} {$err_type} += $cost; $cumulative_cost {ALL} {$c1_value}{$err_type} += $cost; $cumulative_cost {ALL} {total} {$err_type} += $cost; $normalizing_cost{$c2_value}{$c1_value} += $norm_value; $normalizing_cost{$c2_value} {total} += $norm_value; $normalizing_cost {ALL} {$c1_value} += $norm_value; $normalizing_cost {ALL} {total} += $norm_value; } else { $error_count{$c1_value}{$err_type}++; $error_count {total} {$err_type}++; $cumulative_cost{$c1_value}{$err_type} += $cost; $cumulative_cost {total} {$err_type} += $cost; $normalizing_cost{$c1_value} += $norm_value; $normalizing_cost {total} += $norm_value; } } } return ({%error_count}, {%cumulative_cost}, {%normalizing_cost}); } ################################# sub conditional_TIMEX2_statistics { my ($cond1, $cond2) = @_; my ($ref_timex2, $tst_timex2, $ref_occ, $tst_occ, $ref_id, $tst_id, $doc); my ($ref_mentions, $tst_mentions, $ref_mention, $tst_mention); my ($timex2_type, $timex2_value, $count_type, $timex2_class); my ($src_type, $timex2_origin); my ($norm_cost, $cost, $err_type); my (%error_count, %cumulative_cost, %normalizing_cost); my ($c1_value, $c2_value, $value); #initialize output statistics undef %attribute_confusion_statistics; undef %mention_detection_statistics; my @c1_types = $cond1 eq "SOURCE TYPE" ? keys %source_types : $cond1 eq "TIMEX2 ORIGIN" ? @origins : undef; die "\n\nFATAL ERROR in conditional_TIMEX2_statistics: '$cond1' is not a recognized condition\n\n" unless @c1_types; if ($cond2) { my @c2_types = $cond2 eq "SOURCE TYPE" ? keys %source_types : $cond2 eq "TIMEX2 ORIGIN" ? @origins : undef; die "\n\nFATAL ERROR in conditional_TIMEX2_statistics: '$cond2' is not a recognized condition\n\n" unless @c2_types; foreach $c2_value (@c2_types, "ALL") { foreach $c1_value (@c1_types, "total") { $normalizing_cost{$c2_value}{$c1_value} = 0; foreach $err_type (@error_types) { $error_count{$c2_value}{$c1_value}{$err_type} = 0; $cumulative_cost{$c2_value}{$c1_value}{$err_type} = 0; } } } } else { foreach $c1_value (@c1_types, "total") { $normalizing_cost{$c1_value} = 0; foreach $err_type (@error_types) { $error_count{$c1_value}{$err_type} = 0; $cumulative_cost{$c1_value}{$err_type} = 0; } } } #accumulate recognition statistics over all timex2 occurrences in all documents foreach $ref_id (keys %{$ref_database{timex2s}}) { $ref_timex2 = $ref_database{timex2s}{$ref_id}; $timex2_type = $ref_timex2->{TYPE}; $timex2_origin = $ref_timex2->{ORIGIN}; #update timex2 attribute recognition statistics $tst_timex2 = $ref_timex2->{MAP}; my $attribute_errors; if ($tst_timex2) { foreach my $attribute (keys %timex2_attribute_wgt) { my $ref_att = $ref_timex2->{attributes}{$attribute}; $ref_att = "" unless defined $ref_att; my $tst_att = $tst_timex2->{attributes}{$attribute} if $tst_timex2; $tst_att = "" unless defined $tst_att; $attribute_confusion_statistics{$attribute}{$ref_att}{$tst_att}++; $attribute_errors++ if $ref_att ne $tst_att; } } #update document-level statistics foreach $doc (keys %{$ref_timex2->{documents}}) { next unless exists $eval_docs{$doc}; $ref_occ = $ref_timex2->{documents}{$doc}; $src_type = $ref_occ->{SOURCE}; $cost = $norm_cost = $ref_occ->{VALUE}; $tst_occ = $ref_occ->{MAP}; $cost -= $mapped_timex2_document_value{$doc}{$ref_occ->{ID}}{$tst_occ->{ID}} if $tst_occ; #update timex2 detection statistics $err_type = $tst_occ ? ($attribute_errors ? "error" : "correct") : "miss"; $c1_value = $cond1 eq "TIMEX2 TYPE" ? $timex2_type : $cond1 eq "SOURCE TYPE" ? $src_type : $cond1 eq "TIMEX2 ORIGIN" ? $timex2_origin : undef; if ($cond2) { $c2_value = $cond2 eq "TIMEX2 TYPE" ? $timex2_type : $cond2 eq "SOURCE TYPE" ? $src_type : $cond2 eq "TIMEX2 ORIGIN" ? $timex2_origin : undef; $error_count{$c2_value}{$c1_value}{$err_type}++; $error_count{$c2_value} {total} {$err_type}++; $error_count {ALL} {$c1_value}{$err_type}++; $error_count {ALL} {total} {$err_type}++; $cumulative_cost{$c2_value}{$c1_value}{$err_type} += $cost; $cumulative_cost{$c2_value} {total} {$err_type} += $cost; $cumulative_cost {ALL} {$c1_value}{$err_type} += $cost; $cumulative_cost {ALL} {total} {$err_type} += $cost; $normalizing_cost{$c2_value}{$c1_value} += $norm_cost; $normalizing_cost{$c2_value} {total} += $norm_cost; $normalizing_cost {ALL} {$c1_value} += $norm_cost; $normalizing_cost {ALL} {total} += $norm_cost; } else { $error_count{$c1_value}{$err_type}++; $error_count {total} {$err_type}++; $cumulative_cost{$c1_value}{$err_type} += $cost; $cumulative_cost {total} {$err_type} += $cost; $normalizing_cost{$c1_value} += $norm_cost; $normalizing_cost {total} += $norm_cost; } #update timex2 mention statistics if ($tst_occ) { $ref_mentions = $ref_occ->{mentions}; $tst_mentions = $tst_occ->{mentions}; foreach $ref_mention (@$ref_mentions) { $err_type = "miss"; if ($ref_mention->{MAP}) { $err_type = extent_mismatch ($ref_mention->{extent}{locator}, $ref_mention->{MAP}->{extent}{locator}) <= $epsilon ? "correct" : "error"; } $mention_detection_statistics{$err_type}++; } foreach $tst_mention (@$tst_mentions) { next if $tst_mention->{MAP}; $mention_detection_statistics{fa}++; } } } } #update timex2 false alarm statistics foreach $tst_id (keys %{$tst_database{timex2s}}) { $tst_timex2 = $tst_database{timex2s}{$tst_id}; $timex2_type = $tst_timex2->{TYPE}; $timex2_origin = $tst_timex2->{ORIGIN}; foreach $doc (keys %{$tst_timex2->{documents}}) { next unless exists $eval_docs{$doc}; $tst_occ = $tst_timex2->{documents}{$doc}; next if $tst_occ->{MAP}; $src_type = $tst_occ->{SOURCE}; $norm_cost = 0; $cost = -$tst_occ->{FA_VALUE}; $err_type = "fa"; $c1_value = $cond1 eq "TIMEX2 TYPE" ? $timex2_type : $cond1 eq "SOURCE TYPE" ? $src_type : $cond1 eq "TIMEX2 ORIGIN" ? $timex2_origin : undef; if ($cond2) { $c2_value = $cond2 eq "TIMEX2 TYPE" ? $timex2_type : $cond2 eq "SOURCE TYPE" ? $src_type : $cond2 eq "TIMEX2 ORIGIN" ? $timex2_origin : undef; $error_count{$c2_value}{$c1_value}{$err_type}++; $error_count{$c2_value} {total} {$err_type}++; $error_count {ALL} {$c1_value}{$err_type}++; $error_count {ALL} {total} {$err_type}++; $cumulative_cost{$c2_value}{$c1_value}{$err_type} += $cost; $cumulative_cost{$c2_value} {total} {$err_type} += $cost; $cumulative_cost {ALL} {$c1_value}{$err_type} += $cost; $cumulative_cost {ALL} {total} {$err_type} += $cost; $normalizing_cost{$c2_value}{$c1_value} += $norm_cost; $normalizing_cost{$c2_value} {total} += $norm_cost; $normalizing_cost {ALL} {$c1_value} += $norm_cost; $normalizing_cost {ALL} {total} += $norm_cost; } else { $error_count{$c1_value}{$err_type}++; $error_count {total} {$err_type}++; $cumulative_cost{$c1_value}{$err_type} += $cost; $cumulative_cost {total} {$err_type} += $cost; $normalizing_cost{$c1_value} += $norm_cost; $normalizing_cost {total} += $norm_cost; } } } return ({%error_count}, {%cumulative_cost}, {%normalizing_cost}); } ################################# sub print_entity_mapping { my ($ref_db, $tst_db, $label) = @_; my ($ref_entity, $tst_entity, $entity, $ref_id, $tst_id, $doc, $ref_occ, $tst_occ, $output); print "\n======== $label ========\n\n"; foreach $ref_id (sort keys %{$ref_db->{entities}}) { my $print_data = $print_all_data; $output = "--------\n"; $ref_entity = $ref_db->{entities}{$ref_id}; if ($tst_entity = $ref_entity->{MAP}) { $tst_id = $tst_entity->{ID}; my $err_type = $ref_entity->{TYPE} eq $tst_entity->{TYPE} ? undef : "TYPE"; $err_type .= defined $err_type ? "/CLASS" : "CLASS" if ($ref_entity->{CLASS} ne $tst_entity->{CLASS}); my $link_err = $ref_database{entities}{$ref_id}{ORIGIN} ne $tst_database{entities}{$tst_id}{ORIGIN}; $print_data ||= $print_err_data if ($link_err or $err_type); $output .= (($link_err or $err_type) ? ">>> " : " "); $output .= sprintf "ref entity $ref_id (%3.3s/%3.3s)\n", $ref_entity->{TYPE}, $ref_entity->{CLASS}; $output .= (($link_err or $err_type) ? ">>> " : " "); $output .= sprintf "tst entity $tst_id (%3.3s/%3.3s)", $tst_entity->{TYPE}, $tst_entity->{CLASS}; $output .= $err_type ? " -- ENTITY $err_type MISMATCH" : ""; $output .= $link_err ? " -- database LINK ERROR\n" : "\n"; $output .= sprintf (" entity score:%9.5f out of%9.5f\n", $mapped_entity_value{$ref_id}{$tst_id}, $ref_entity->{VALUE}); } else { $print_data ||= $print_err_data; $output .= sprintf (">>> ref entity $ref_id (%3.3s/%3.3s) -- NO MATCHING TST ENTITY\n", $ref_entity->{TYPE}, $ref_entity->{CLASS}); $output .= sprintf " entity score: 0.00000 out of%9.5f\n", $ref_entity->{VALUE}; } foreach $doc (keys %{$ref_entity->{documents}}) { next unless defined $eval_docs{$doc}; $ref_occ = $ref_entity->{documents}{$doc}; print_entity_mapping_details ($ref_occ, $ref_occ->{MAP}, $doc, $print_data, $output); $output = ""; } } foreach $tst_id (sort keys %{$tst_db->{entities}}) { $tst_entity = $tst_db->{entities}{$tst_id}; next if $tst_entity->{MAP}; my $print_data = ($print_all_data or $print_err_data); $output = "--------\n" unless $tst_entity->{MAP}; $output .= sprintf (">>> tst entity $tst_id (%3.3s/%3.3s) -- NO MATCHING REF ENTITY\n", $tst_entity->{TYPE}, $tst_entity->{CLASS}); $output .= sprintf " entity score:%9.5f out of 0.00000\n", $tst_entity->{FA_VALUE}; foreach $doc (keys %{$tst_entity->{documents}}) { next unless defined $eval_docs{$doc}; $tst_occ = $tst_entity->{documents}{$doc}; next if $tst_occ->{MAP}; print_entity_mapping_details (undef, $tst_occ, $doc, $print_data, $output); $output = ""; } } } ################################# sub print_entity_mapping_details { my ($ref_entity, $tst_entity, $doc, $print_data, $output) = @_; my ($ref_mentions, $tst_mentions, $ref_mention, $tst_mention, $ref_names, $tst_names, $ref_name, $tst_name); my ($error_type, $link_err, $text); my ($entity, $type, $mention, @mentions, $name, @names); $output .= "- in document $doc:\n"; if ($ref_entity) { foreach $mention (@{$ref_entity->{mentions}}) { push @mentions, {DATA=>$mention, TYPE=>"REF"}; } foreach $name (@{$ref_entity->{names}}) { push @names, {DATA=>$name, TYPE=>"REF"}; } } if ($tst_entity) { foreach $mention (@{$tst_entity->{mentions}}) { push @mentions, {DATA=>$mention, TYPE=>"TST"}; } foreach $name (@{$tst_entity->{names}}) { push @names, {DATA=>$name, TYPE=>"TST"}; } } if ($ref_entity and $tst_entity) { foreach $mention (sort sort_on_locator @mentions) { $type = $mention->{TYPE}; $mention = $mention->{DATA}; next if $type eq "TST" and $mention->{MAP}; if ($mention->{MAP}) { $ref_mention = $mention; $tst_mention = $mention->{MAP}; $error_type = $ref_mention->{TYPE} eq $tst_mention->{TYPE} ? undef : "TYPE"; $error_type .= defined $error_type ? "/ROLE" : "ROLE" if ($ref_mention->{ROLE} ne $tst_mention->{ROLE}); $error_type .= defined $error_type ? "/STYLE" : "STYLE" if ($ref_mention->{STYLE} ne $tst_mention->{STYLE}); if ($error_type) { $print_data ||= $print_err_data; $output .= ">>> ref mention = \"" . (defined $ref_mention->{head}{text} ? $ref_mention->{head}{text} : "???") . "\""; $output .= sprintf " (%3.3s/%3.3s/%3.3s) -- MENTION $error_type MISMATCH (%3.3s/%3.3s/%3.3s)\n", $ref_mention->{TYPE}, $ref_mention->{ROLE}, $ref_mention->{STYLE}, $tst_mention->{TYPE}, $tst_mention->{ROLE}, $tst_mention->{STYLE}; } else { $output .= " ref mention = \"" . (defined $ref_mention->{head}{text} ? $ref_mention->{head}{text} : "???") . "\""; $output .= sprintf " (%3.3s/%3.3s/%3.3s)\n", $ref_mention->{TYPE}, $ref_mention->{ROLE}, $ref_mention->{STYLE}; } if (extent_mismatch ($ref_mention->{extent}{locator}, $tst_mention->{extent}{locator}) > 1) { $print_data ||= $print_err_data; $text = defined $ref_mention->{extent}{text} ? $ref_mention->{extent}{text} : "???"; $output .= ">>> ref mention extent = \"$text\"\n"; $text = defined $tst_mention->{extent}{text} ? $tst_mention->{extent}{text} : "???"; $output .= ">>> tst mention extent = \"$text\" -- MENTION EXTENT MISMATCH\n"; } } else { $print_data ||= $print_err_data; $output .= ">>> ".(lc$type)." mention = \"" . (defined $mention->{head}{text} ? $mention->{head}{text} : "???") . "\""; $output .= sprintf " (%3.3s/%3.3s/%3.3s) -- NO MATCHING %s MENTION\n", $mention->{TYPE}, $mention->{ROLE}, $mention->{STYLE}, ($type eq "REF"?"TST":"REF"); } } foreach $name (sort sort_on_locator @names) { $type = $name->{TYPE}; $name = $name->{DATA}; next if $type eq "TST" and $name->{MAP}; if ($name->{MAP}) { $ref_name = $name; $tst_name = $name->{MAP}; if (extent_mismatch($ref_name->{locator}, $tst_name->{locator}) <= 1) { $output .= " ref name = \"" . (defined $ref_name->{text} ? $ref_name->{text} : "???") . "\"\n"; } else { $print_data ||= $print_err_data; $text = defined $ref_name->{text} ? $ref_name->{text} : "???"; $output .= ">>> ref name extent = \"$text\"\n"; $text = defined $tst_name->{text} ? $tst_name->{text} : "???"; $output .= ">>> tst name extent = \"$text\" -- NAME EXTENT MISMATCH\n"; } } else { $print_data ||= $print_err_data; $output .= ">>> ".(lc$type)." name = \"" . (defined $name->{text} ? $name->{text} : "???") . "\""; $output .= " -- NO MATCHING ".($type eq "REF"?"TST":"REF")." NAME\n"; } } } else { $print_data ||= $print_err_data; foreach $mention (sort sort_on_locator @mentions) { $type = $mention->{TYPE}; $mention = $mention->{DATA}; $output .= " ".(lc$type)." mention = \"" . (defined $mention->{head}{text} ? $mention->{head}{text} : "???") . "\""; $output .= sprintf " (%3.3s/%3.3s/%3.3s)\n", $mention->{TYPE}, $mention->{ROLE}, $mention->{STYLE}; } foreach $name (sort sort_on_locator @names) { $type = $name->{TYPE}; $name = $name->{DATA}; $output .= " ".(lc$type)." name = \"" . (defined $name->{text} ? $name->{text} : "???") . "\""; $output .= " -- NO MATCHING ".($type eq "REF"?"TST":"REF")." NAME\n"; } } print $output if $print_data; } ################################# sub print_relation_mapping { my ($ref_db, $tst_db, $label) = @_; my ($ref_relation, $tst_relation, $ref_id, $tst_id); my ($print_data, $error_type, $pre_err, $post_err, $arg, $narg, $entity); my ($type_mismatch, $subtype_mismatch, $class_mismatch, $time_mismatch); my ($e_ref, $e_tst, $a_ref, $a_tst, $arg_err); print "\n======== $label ========\n\n"; foreach $ref_id (sort keys %{$ref_db->{relations}}) { $print_data = $print_all_data; $ref_relation = $ref_db->{relations}{$ref_id}; if ($tst_relation = $ref_relation->{MAP}) { $tst_id = $tst_relation->{ID}; undef $error_type; $type_mismatch = $ref_relation->{TYPE} ne $tst_relation->{TYPE}; $subtype_mismatch = $ref_relation->{SUBTYPE} ne $tst_relation->{SUBTYPE}; $class_mismatch = $ref_relation->{CLASS} ne $tst_relation->{CLASS}; $time_mismatch = num_relation_time_mismatches ($ref_relation->{times}, $tst_relation->{times}) > 0; $error_type = "TYPE" if $type_mismatch; $error_type .= defined $error_type ? "/SUBTYPE" : "SUBTYPE" if $subtype_mismatch; $error_type .= defined $error_type ? "/CLASS" : "CLASS" if $class_mismatch; $error_type .= defined $error_type ? "/TIME" : "TIME" if $time_mismatch; $print_data ||= $print_err_data if $error_type; next unless $print_data; #print mapped relations print "--------\n"; printf " relation score:%9.5f out of%9.5f\n", $mapped_relation_value{$ref_id}{$tst_id}, $ref_relation->{VALUE}; $pre_err = $error_type ? ">>>" : " "; $post_err = $error_type ? " -- $error_type MISMATCH\n" : "\n"; print "$pre_err ref relation $ref_id ($ref_relation->{TYPE}/$ref_relation->{SUBTYPE}/$ref_relation->{CLASS})$post_err"; foreach $time (@{$ref_relation->{times}}) { $pre_err = num_relation_time_matches ([$time], $tst_relation->{times}) ? " " : ">>>"; printf "$pre_err time info:%s\n", relation_time_info($time); } $narg=0; foreach $arg (@{$ref_relation->{arguments}}) { $e_ref = $ref_db->{entities}{$arg->{ID}}; $a_ref = $ref_relation->{arguments}[$narg]; $arg_err = ((not $a_ref->{MAP}) ? "NO CORRESPONDING TST ARGUMENT" : ((not $e_ref->{MAP}) ? "UNMAPPED ARGUMENT" : (($e_ref->{MAP}{ID} ne $a_ref->{MAP}{ID}) ? "MISMAPPED ARGUMENT" : ""))); print $arg_err ? ">>> ":" ", relation_arg_description (++$narg, $e_ref, $arg_err); } print_relation_mentions ($ref_relation->{documents}); $pre_err = $error_type ? ">>>" : " "; print "$pre_err tst relation $tst_id ($tst_relation->{TYPE}/$tst_relation->{SUBTYPE}/$tst_relation->{CLASS})$post_err"; foreach $time (@{$tst_relation->{times}}) { $pre_err = num_relation_time_matches ([$time], $ref_relation->{times}) ? " " : ">>>"; printf "$pre_err time info:%s\n", relation_time_info($time); } $narg=0; foreach $arg (@{$tst_relation->{arguments}}) { $e_tst = $tst_db->{entities}{$arg->{ID}}; $a_tst = $tst_relation->{arguments}[$narg]; $arg_err = ((not $a_tst->{MAP}) ? "NO CORRESPONDING REF ARGUMENT" : ((not $e_tst->{MAP}) ? "UNMAPPED ARGUMENT" : (($e_tst->{MAP}{ID} ne $a_tst->{MAP}{ID}) ? "MISMAPPED ARGUMENT": ""))); print $arg_err ? ">>> ":" ", relation_arg_description (++$narg, $e_tst, $arg_err); } print_relation_mentions ($tst_relation->{documents}); } #print unmapped reference relations else { $print_data ||= $print_err_data; { next unless $print_data; print "--------\n"; printf " relation score: 0.00000 out of%9.5f\n", $ref_relation->{VALUE}; $pre_err = ">>>"; $post_err = " -- NO MATCHING TST RELATION\n"; print "$pre_err ref relation $ref_id ($ref_relation->{TYPE}/$ref_relation->{SUBTYPE}/$ref_relation->{CLASS})$post_err"; foreach $time (@{$ref_relation->{times}}) { printf "$ time info:%s\n", relation_time_info($time); } $narg=0; foreach $arg (@{$ref_relation->{arguments}}) { $entity = $ref_db->{entities}{$arg->{ID}}; print " ".relation_arg_description (++$narg, $entity); } print_relation_mentions ($ref_relation->{documents}); } } } #print unmapped test relations $print_data ||= $print_err_data; return unless $print_data; foreach $tst_id (sort keys %{$tst_db->{relations}}) { $tst_relation = $tst_db->{relations}{$tst_id}; next if $tst_relation->{MAP}; print "--------\n"; printf " relation score:%9.5f out of 0.00000\n", -$tst_relation->{VALUE}; $pre_err = ">>>"; $post_err = " -- NO MATCHING REF RELATION\n"; print "$pre_err tst relation $tst_id ($tst_relation->{TYPE}/$tst_relation->{SUBTYPE}/$tst_relation->{CLASS})$post_err"; foreach $time (@{$tst_relation->{times}}) { printf " time info:%s\n", relation_time_info($time); } $narg=0; foreach $arg (@{$tst_relation->{arguments}}) { $entity = $tst_db->{entities}{$arg->{ID}}; print " ".relation_arg_description (++$narg, $entity); } print_relation_mentions ($tst_relation->{documents}); } } ################################# sub num_relation_time_mismatches { my ($t1, $t2) = @_; my $num_mismatches=0; $num_mismatches += @{$t1} if defined $t1; $num_mismatches += @{$t2} if defined $t2; $num_mismatches -= 2*num_relation_time_matches($t1, $t2); return $num_mismatches; } ################################# sub num_relation_time_matches { my ($times1, $times2) = @_; return 0 unless $times1 and $times2; my @times1 = @{$times1}; my @times2 = @{$times2}; my $num_matches=0; foreach my $time1 (@times1) { foreach my $time2 (@times2) { $num_matches++ if relation_time_match ($time1, $time2); # my $match = 1; # foreach my $type (@info_type) { # next if not $time1->{$type} and not @time2->{$type}; # $match = 0 if not $time1->{$type} or not $time2->{$type} or $time1->{$type} ne $time2->{$type}; # } # $num_matches += $match; } } return $num_matches; } ################################# sub relation_time_match { my ($t1, $t2) = @_; my @info_type = ("TYPE", "VAL", "MOD", "DIR"); foreach my $type (@info_type) { next if not $t1->{$type} and not $t2->{$type}; return "" if not $t1->{$type} or not $t2->{$type} or $t1->{$type} ne $t2->{$type}; } return 1; } ################################# sub relation_time_info { my ($time) = @_; my $info = ""; return "" unless $time; foreach my $type (@relation_time_attributes, "text") { my $value = $time->{$type} ? $time->{$type} : ""; next unless $value; $info .= "," if $info; $info .= " $type=\"$value\""; } return $info; } ################################# sub relation_arg_description { my ($narg, $entity, $error) = @_; my $out = sprintf "arg %d (%3.3s/%3.3s): ID=%s", $narg, $entity->{TYPE}, $entity->{CLASS}, $entity->{ID}; if (my $name=longest_entity_name($entity)) { $out .= sprintf ", name=\"%s\"", limit_string($max_string_length, $name); } elsif (my $head=longest_entity_mention_head($entity)) { $out .= sprintf ", head=\"%s\"", limit_string($max_string_length, $head); } $out .= $error ? sprintf " -- $error\n" : "\n"; return $out; } ################################# sub print_relation_mentions { my ($docs) = @_; my ($doc, @mentions, $mention, $time, @times, $arg, $narg); foreach $doc (sort keys %{$docs}) { print " -- in document $doc\n"; @mentions = @{$docs->{$doc}{mentions}}; foreach $mention (@mentions) { print " mention ID=$mention->{ID}, lex_cond=$mention->{LC}"; @times = @{$mention->{times}}; printf ", time info:%s", relation_time_info($time) if ($time=pop @times); foreach $time (@times) { printf " & %s", relation_time_info($time); } print "\n"; $narg=0; foreach $arg (@{$mention->{arguments}}) { printf " arg %d: ID=$arg->{MENTIONID}, text=\"%s\"\n", ++$narg, (defined $arg->{extent}{text} ? $arg->{extent}{text} : "???"); } } } } ################################# sub compute_entity_values { my ($ref_id, $ref_entity, $tst_id, $tst_entity, $value); my ($doc, $doc_entity, $ref_occ, $tst_occ, %doc_refs, %doc_tsts); my $ref_entities = $ref_database{entities}; my $tst_entities = $tst_database{entities}; #prepare to deweight FA tst mentions that correspond to ref mentions tag_tst_mentions_that_correspond_to_ref_mentions ($ref_entities, $tst_entities) if $entity_mention_ref_fa_wgt != 1.0; #group document-level ref entities by document foreach $ref_id (keys %$ref_entities) { $ref_entity = $ref_entities->{$ref_id}; $ref_entity->{VALUE} = 0; foreach $doc (keys %{$ref_entity->{documents}}) { $ref_occ = $ref_entity->{documents}{$doc}; push @{$doc_refs{$doc}}, $ref_occ; } } #group document-level tst entities by document foreach $tst_id (keys %$tst_entities) { $tst_entity = $tst_entities->{$tst_id}; $tst_entity->{FA_VALUE} = 0; foreach $doc (keys %{$tst_entity->{documents}}) { $tst_occ = $tst_entity->{documents}{$doc}; push @{$doc_tsts{$doc}}, $tst_occ; } } undef %mapped_entity_document_value; undef %mapped_entity_value; foreach $doc (keys %doc_refs) { foreach $ref_occ (@{$doc_refs{$doc}}) { #compute ref entity values $ref_id = $ref_occ->{ID}; ($ref_occ->{VALUE}) = entity_document_value($ref_occ); $ref_entities->{$ref_id}{VALUE} += $ref_occ->{VALUE}; #compute ref-tst mapped entity values foreach $tst_occ (@{$doc_tsts{$doc}}) { ($value) = entity_document_value ($ref_occ, $tst_occ); next unless defined $value; $tst_id = $tst_occ->{ID}; $mapped_entity_document_value{$doc}{$ref_id}{$tst_id} = $value; $mapped_entity_value{$ref_id}{$tst_id} += $value; } } } #compute tst entity values foreach $doc (keys %doc_tsts) { foreach $tst_occ (@{$doc_tsts{$doc}}) { $tst_id = $tst_occ->{ID}; ($tst_occ->{VALUE}) = entity_document_value($tst_occ); $tst_entities->{$tst_id}{VALUE} += $tst_occ->{VALUE}; ($tst_occ->{FA_VALUE}) = entity_document_value(undef,$tst_occ); $tst_entities->{$tst_id}{FA_VALUE} += $tst_occ->{FA_VALUE}; } } } ################################# sub tag_tst_mentions_that_correspond_to_ref_mentions { my ($ref_entities, $tst_entities) = @_; my ($doc, %doc_entities); my ($ref_id, $ref_entity, $ref_occ, $ref_mention, $ref_locator); my ($tst_id, $tst_entity, $tst_occ, $tst_mention, $tst_locator); foreach $ref_id (keys %$ref_entities) { $ref_entity = $ref_entities->{$ref_id}; foreach $doc (keys %{$ref_entity->{documents}}) { $ref_occ = $ref_entity->{documents}{$doc}; push @{$doc_entities{$doc}{REF}}, $ref_occ; } } foreach $tst_id (keys %$tst_entities) { $tst_entity = $tst_entities->{$tst_id}; foreach $doc (keys %{$tst_entity->{documents}}) { $tst_occ = $tst_entity->{documents}{$doc}; push @{$doc_entities{$doc}{TST}}, $tst_occ; } } foreach $doc (keys %doc_entities) { foreach $ref_occ (@{$doc_entities{$doc}{REF}}) { foreach $tst_occ (@{$doc_entities{$doc}{TST}}) { foreach $ref_mention (@{$ref_occ->{mentions}}) { foreach $tst_mention (@{$tst_occ->{mentions}}) { next unless defined $ref_mention->{head} and ($ref_locator=$ref_mention->{head}{locator}); next unless defined $tst_mention->{head} and ($tst_locator=$tst_mention->{head}{locator}); next if span_overlap($ref_locator, $tst_locator) < $min_overlap; $tst_mention->{is_ref_mention} = 1; } } } } } } ################################# sub map_entities { my ($db, $ref_id, $tst_id, $ref_entity, $tst_entity); my (@ref_entities, @tst_entities, @ref_cohorts, @tst_cohorts); #collect entities foreach $ref_id (keys %{$ref_database{entities}}) { $ref_entity = $ref_database{entities}{$ref_id}; push @ref_entities, $ref_entity; } foreach $tst_id (keys %{$tst_database{entities}}) { $tst_entity = $tst_database{entities}{$tst_id}; push @tst_entities, $tst_entity; } #group entities into cohort sets and map each set independently foreach $ref_entity (@ref_entities) { next if exists $ref_entity->{cohort}; tag_cohorts ($ref_entity, \@ref_entities, \@tst_entities, \%mapped_entity_value, 1); @ref_cohorts = collect_cohorts (\@ref_entities); @tst_cohorts = collect_cohorts (\@tst_entities); map_cohorts (\@ref_cohorts, \@tst_cohorts, \%mapped_entity_value, $entity_fa_wgt); foreach $ref_entity (@ref_cohorts) { $tst_entity = $ref_entity->{MAP}; next unless $tst_entity; map_document_level_entity_mentions ($ref_entity, $tst_entity); } } } ################################# sub collect_cohorts { my ($entities) = @_; my ($entity, @cohorts); foreach $entity (@$entities) { next if $entity->{mapped}; next unless $entity->{cohort}; push @cohorts, $entity; $entity->{mapped} = 1; } return @cohorts; } ################################# sub tag_cohorts { my ($ent, $ent_set, $map_set, $mapped_values, $ent_set_is_ref) = @_; $ent->{cohort} = 1; foreach my $ent_map (@$map_set) { next if exists $ent_map->{cohort}; tag_cohorts ($ent_map, $map_set, $ent_set, $mapped_values, not $ent_set_is_ref) if $ent_set_is_ref ? defined $mapped_values->{$ent->{ID}}{$ent_map->{ID}} : defined $mapped_values->{$ent_map->{ID}}{$ent->{ID}}; } } ################################# sub map_cohorts { my ($ref_cohorts, $tst_cohorts, $mapped_values, $fa_wgt) = @_; my ($i, $j, $ref_id, $tst_id, %mapping_costs, $fa_value); #compute mapping costs for ($i=0; $i<@$ref_cohorts; $i++) { $ref_id = $ref_cohorts->[$i]{ID}; for ($j=0; $j<@$tst_cohorts; $j++) { $tst_id = $tst_cohorts->[$j]{ID}; next unless defined $mapped_values->{$ref_id}{$tst_id}; $fa_value = $fa_wgt*$tst_cohorts->[$j]{VALUE}; $fa_value *= $entity_mention_ref_fa_wgt if $tst_cohorts->[$j]{is_ref_mention}; $mapping_costs{$i}{$j} -= $mapped_values->{$ref_id}{$tst_id} + $fa_value; } } my $map = weighted_bipartite_graph_matching(\%mapping_costs) or die "\n\nFATAL ERROR: Cohort mapping through BGM FAILED\n\n"; foreach $i (keys %$map) { $j = $map->{$i}; $ref_cohorts->[$i]{MAP} = $tst_cohorts->[$j]; $tst_cohorts->[$j]{MAP} = $ref_cohorts->[$i]; } } ################################# sub map_document_level_entity_mentions { my ($ref_entity, $tst_entity) = @_; my ($ref_id, $doc, $ref_occ, $tst_occ); foreach $doc (keys %{$ref_entity->{documents}}) { $ref_occ = $ref_entity->{documents}{$doc}; $tst_occ = $tst_entity->{documents}{$doc}; next unless $tst_occ; $ref_occ->{MAP} = $tst_occ; $tst_occ->{MAP} = $ref_occ; #map mentions (my $value, my $map) = entity_document_value($ref_occ, $tst_occ); my $ref_mentions = $ref_occ->{mentions}; my $tst_mentions = $tst_occ->{mentions}; foreach my $i (keys %$map) { my $j = $map->{$i}; $ref_mentions->[$i]{MAP} = $tst_mentions->[$j]; $tst_mentions->[$j]{MAP} = $ref_mentions->[$i]; } #map names my ($ref_names, $tst_names, $ref_name, $tst_name, $overlap, $max_overlap); $ref_names = $ref_occ->{names}; $tst_names = $ref_occ->{MAP}{names}; foreach $ref_name (@$ref_names) { $max_overlap = $min_overlap; foreach $tst_name (@$tst_names) { $overlap = span_overlap($ref_name->{locator}, $tst_name->{locator}); if ($overlap > $max_overlap) { $max_overlap = $overlap; $ref_name->{MAP} = $tst_name; } } $tst_name = $ref_name->{MAP}; $tst_name->{MAP} = $ref_name if $tst_name; } } } ################################# sub compute_relation_values { my ($ref_id, $ref_relation, $tst_id, $tst_relation, $value); my ($doc, $doc_relation, $ref_occ, $tst_occ, %doc_relations, %mapped_value_computed); my $ref_relations = $ref_database{relations}; my $tst_relations = $tst_database{relations}; #group ref relations by document foreach $ref_id (keys %$ref_relations) { $ref_relation = $ref_relations->{$ref_id}; foreach $doc (keys %{$ref_relation->{documents}}) { $doc_relations{$doc}{REF}{$ref_id} = $ref_relation; ($ref_relation->{documents}{$doc}{VALUE}) = relation_document_value ($ref_relation, undef, $doc); $ref_relation->{VALUE} += $ref_relation->{documents}{$doc}{VALUE}; } } #group tst relations by document foreach $tst_id (keys %$tst_relations) { $tst_relation = $tst_relations->{$tst_id}; foreach $doc (keys %{$tst_relation->{documents}}) { $doc_relations{$doc}{TST}{$tst_id} = $tst_relation; ($tst_relation->{documents}{$doc}{VALUE}) = relation_document_value (undef, $tst_relation, $doc); $tst_relation->{VALUE} += $tst_relation->{documents}{$doc}{VALUE}; } } #compute ref-tst mapped relation values undef %mapped_relation_value; foreach $doc (keys %doc_relations) { foreach $ref_id (keys %{$doc_relations{$doc}{REF}}) { foreach $tst_id (keys %{$doc_relations{$doc}{TST}}) { next if $mapped_value_computed{$ref_id}{$tst_id}; ($mapped_relation_value{$ref_id}{$tst_id}) = relation_document_value ($ref_relations->{$ref_id}, $tst_relations->{$tst_id}); $mapped_value_computed{$ref_id}{$tst_id} = 1; } } } } ################################# sub map_relations { my ($ref_id, $tst_id, $ref_relation, $tst_relation); my (@ref_relations, @tst_relations, @ref_cohorts, @tst_cohorts); #collect relations foreach $ref_id (keys %{$ref_database{relations}}) { $ref_relation = $ref_database{relations}{$ref_id}; push @ref_relations, $ref_relation; } foreach $tst_id (keys %{$tst_database{relations}}) { $tst_relation = $tst_database{relations}{$tst_id}; push @tst_relations, $tst_relation; } #group relations into cohort sets and map each set independently foreach $ref_relation (@ref_relations) { next if exists $ref_relation->{cohort}; tag_cohorts ($ref_relation, \@ref_relations, \@tst_relations, \%mapped_relation_value, 1); @ref_cohorts = collect_cohorts (\@ref_relations); @tst_cohorts = collect_cohorts (\@tst_relations); map_cohorts (\@ref_cohorts, \@tst_cohorts, \%mapped_relation_value, $relation_fa_wgt); foreach $ref_relation (@ref_cohorts) { $tst_relation = $ref_relation->{MAP}; next unless $tst_relation; map_document_level_relation_mentions ($ref_relation, $tst_relation); foreach my $doc (keys %{$ref_relation->{documents}}, keys %{$tst_relation->{documents}}) { next if defined $mapped_relation_document_value{$doc}{$ref_relation->{ID}}{$tst_relation->{ID}}; ($mapped_relation_document_value{$doc}{$ref_relation->{ID}}{$tst_relation->{ID}}) = relation_document_value ($ref_relation, $tst_relation, $doc); } } } } ################################# sub map_document_level_relation_mentions { my ($ref_relation, $tst_relation) = @_; my ($value, $arg_order, $narg, $ref_arg, $tst_arg, $doc, $ref_occ, $tst_occ); ($value, $arg_order) = relation_document_value ($ref_relation, $tst_relation); #map relation arguments for ($narg=0; $narg<@{$ref_relation->{arguments}}; $narg++) { $ref_arg = $ref_relation->{arguments}[$narg]; $tst_arg = $tst_relation->{arguments}[$arg_order->[$narg]]; $ref_arg->{MAP} = $tst_arg; $tst_arg->{MAP} = $ref_arg; } #map document level info foreach $doc (keys %{$ref_relation->{documents}}) { $ref_occ = $ref_relation->{documents}{$doc}; $tst_occ = $tst_relation->{documents}{$doc}; next unless $tst_occ; $ref_occ->{MAP} = $tst_occ; $tst_occ->{MAP} = $ref_occ; for ($narg=0; $narg<@{$ref_relation->{arguments}}; $narg++) { $ref_arg = $ref_occ->{arguments}[$narg]; $tst_arg = $tst_occ->{arguments}[$arg_order->[$narg]]; $ref_arg->{MAP} = $tst_arg; $tst_arg->{MAP} = $ref_arg; } } } ################################# sub entity_value { #N.B. The entity value must be undef if a match is not possible my ($ref_entity, $tst_entity) = @_; my ($doc_value, $total_value, $entity_value, $ref_doc); #entities must exist return undef unless $ref_entity; $tst_entity = $ref_entity unless $tst_entity; #accumulate value over all documents foreach $ref_doc (keys %{$ref_entity->{documents}}) { ($doc_value) = entity_document_value ($ref_entity->{documents}{$ref_doc}, $tst_entity->{documents}{$ref_doc}); $total_value += $doc_value if defined $doc_value; } return undef unless defined $total_value; return $total_value; } ################################# sub entity_document_value { my ($ref_doc_entity, $tst_doc_entity) = @_; my $fa_entity = not $ref_doc_entity; #calculate FA score if ref is null $ref_doc_entity = $tst_doc_entity unless $ref_doc_entity; $tst_doc_entity = $ref_doc_entity unless $tst_doc_entity; return undef unless $ref_doc_entity; my $ref_mentions = $ref_doc_entity->{mentions}; my $tst_mentions = $tst_doc_entity->{mentions}; my ($mention_value, $fa_score, %mapping_costs); for (my $i=0; $i<@$ref_mentions; $i++) { for (my $j=0; $j<@$tst_mentions; $j++) { $mention_value = entity_mention_score ($ref_mentions->[$i], $tst_mentions->[$j]); next unless defined $mention_value; $fa_score = $entity_mention_fa_wgt * $mention_type_wgt{$tst_mentions->[$j]{TYPE}}; $fa_score *= $entity_mention_ref_fa_wgt if $tst_mentions->[$j]{is_ref_mention}; $mapping_costs{$j}{$i} -= $mention_value + $fa_score; } } return undef unless %mapping_costs; #find optimum mapping of ref mentions to tst mentions my ($map) = weighted_bipartite_graph_matching(\%mapping_costs) or die "\n\nFATAL ERROR: Document level entity mention mapping through BGM FAILED\n\n"; undef $map if $fa_entity; #calculate FA score if ref is null my ($doc_mentions_value, $mentions_map); for (my $j=0; $j<@$tst_mentions; $j++) { my $i = $map->{$j}; if (defined $i) { $doc_mentions_value -= $mapping_costs{$j}{$i}; $mentions_map->{$i} = $j; } $fa_score = $entity_mention_fa_wgt * $mention_type_wgt{$tst_mentions->[$j]{TYPE}}; $fa_score *= $entity_mention_ref_fa_wgt if $tst_mentions->[$j]{is_ref_mention}; $doc_mentions_value -= $fa_score; } return undef unless defined $doc_mentions_value; my $entity_value = min($entity_type_wgt{$ref_doc_entity->{TYPE}}*$entity_class_wgt{$ref_doc_entity->{CLASS}}, $entity_type_wgt{$tst_doc_entity->{TYPE}}*$entity_class_wgt{$tst_doc_entity->{CLASS}}); $entity_value = max($entity_value,$epsilon); #reduce value for errors in entity attributes $entity_value *= $entity_err_wgt{TYPE} if $ref_doc_entity->{TYPE} ne $tst_doc_entity->{TYPE}; $entity_value *= $entity_err_wgt{SUBTYPE} if $ref_doc_entity->{SUBTYPE} ne $tst_doc_entity->{SUBTYPE}; $entity_value *= $entity_err_wgt{CLASS} if $ref_doc_entity->{CLASS} ne $tst_doc_entity->{CLASS}; return $entity_value*$doc_mentions_value, $mentions_map; } ################################# sub entity_mention_score { #N.B. The mention mapping score must be undef if tst doesn't match ref. my ($ref_mention, $tst_mention) = @_; my ($ref_locator, $tst_locator, $score); if (defined $ref_mention->{head} and ($ref_locator = $ref_mention->{head}{locator}) and defined $tst_mention->{head} and ($tst_locator = $tst_mention->{head}{locator})) { return undef if span_overlap($ref_locator, $tst_locator) < $min_overlap; $score = min($mention_type_wgt{$ref_mention->{TYPE}},$mention_type_wgt{$tst_mention->{TYPE}}); } elsif (defined $ref_mention->{extent} and ($ref_locator = $ref_mention->{extent}{locator}) and defined $tst_mention->{extent} and ($tst_locator = $tst_mention->{extent}{locator})) { return undef if span_overlap($ref_locator, $tst_locator) < $min_overlap; $score = $epsilon; } else { return undef; } #reduce value for errors in mention attributes $score *= $entity_mention_err_wgt{TYPE} if $ref_mention->{TYPE} ne $tst_mention->{TYPE}; $score *= $entity_mention_err_wgt{ROLE} if $ref_mention->{ROLE} ne $tst_mention->{ROLE}; $score *= $entity_mention_err_wgt{STYLE} if $ref_mention->{STYLE} ne $tst_mention->{STYLE}; return $score; } ################################# sub relation_document_value { my ($ref_relation, $tst_relation, $doc) = @_; my ($arg, @docs, $arg_order, $arg_score, $alt_score, @ref_args, @tst_args, @alt_args); my ($ref_db, $tst_db, $ref, $tst); ($tst_db, $tst) = $tst_relation ? (\%tst_database, $tst_relation) : (\%ref_database, $ref_relation); ($ref_db, $ref) = $ref_relation ? (\%ref_database, $ref_relation) : (\%tst_database, $tst_relation); foreach $arg (@{$ref->{arguments}}) { push @ref_args, $arg->{ID}; } foreach $arg (@{$tst->{arguments}}) { push @tst_args, $arg->{ID}; } @docs = defined $doc ? ($doc) : keys %{$ref->{documents}}; $arg_score = relation_arguments_score ($ref_db, \@ref_args, $tst_db, \@tst_args, \@docs); if (not $ref_relation or not $tst_relation) { return undef unless defined $arg_score; $arg_order = [0,1]; } elsif ($relation_symmetry{$ref->{TYPE}}{$ref->{SUBTYPE}}) { @alt_args = reverse @tst_args; $alt_score = relation_arguments_score ($ref_db, \@ref_args, $tst_db, \@alt_args, \@docs); if (defined $arg_score and ((not defined $alt_score) or $arg_score >= $alt_score)) { $arg_order = [0,1]; } elsif (defined $alt_score) { $arg_score = $alt_score; $arg_order = [1,0]; } else { return undef; } } elsif (not defined $arg_score) { return undef; } else { $arg_order = [0,1]; } my $relation_value = min($relation_type_wgt{$ref->{TYPE}}, $relation_type_wgt{$tst->{TYPE}}); #reduce value for errors in relation attributes $relation_value *= $relation_err_wgt{TYPE} if $ref->{TYPE} ne $tst->{TYPE}; $relation_value *= $relation_err_wgt{SUBTYPE} if $ref->{SUBTYPE} ne $tst->{SUBTYPE}; return $relation_value*$arg_score, $arg_order; } ################################# sub relation_arguments_score { my ($ref_db, $ref_args, $tst_db, $tst_args, $docs) = @_; my ($ref_id, $tst_id, $ref_entities, $tst_entities); $ref_entities = $ref_db->{entities}; $tst_entities = $tst_db->{entities}; my $self_score = $ref_db eq $tst_db; my $score; for (my $i=0; $i<@$ref_args; $i++) { $ref_id = $ref_args->[$i]; $tst_id = $tst_args->[$i]; my $min_value = $relation_argument_threshold * $ref_entities->{$ref_id}{VALUE}; my $match_ok = $self_score ? 1 : (defined $mapped_entity_value{$ref_id}{$tst_id} ? $mapped_entity_value{$ref_id}{$tst_id} > (1-$required_precision)*$min_value : undef); my $arg_score; foreach my $doc (@$docs) { if ($self_score) { $arg_score += $ref_entities->{$ref_id}{documents}{$doc}{VALUE} if defined $ref_entities->{$ref_id}{documents}{$doc}; } elsif ($match_ok and defined $mapped_entity_document_value{$doc}{$ref_id}{$tst_id}) { $arg_score += $mapped_entity_document_value{$doc}{$ref_id}{$tst_id}; } } # return undef unless $arg_score; $score += $arg_score if $arg_score; } return $score; } ################################# sub span_overlap { #This subroutine returns the minimum mutual overlap between two spans. my ($ref_locator, $tst_locator, $minmax) = @_; if ($ref_locator->{data_type} eq "text" and $tst_locator->{data_type} eq "text") { return text_span_overlap ($ref_locator, $tst_locator, $minmax); } elsif ($ref_locator->{data_type} eq "audio" and $tst_locator->{data_type} eq "audio") { return audio_span_overlap ($ref_locator, $tst_locator, $minmax); } elsif ($ref_locator->{data_type} eq "image" and $tst_locator->{data_type} eq "image") { return image_span_overlap ($ref_locator, $tst_locator, $minmax); } else { die "\n\nFATAL ERROR in span_overlap\n" ." unknown or nonexistent or incompatible ref/tst locator data types\n\n"; } } ################################# sub text_span_overlap { #This subroutine returns the minimum mutual overlap between two spans. my ($ref_locator, $tst_locator, $minmax) = @_; my ($ref_start, $ref_end, $tst_start, $tst_end, $overlap); $ref_start = $ref_locator->{start}; $ref_end = $ref_locator->{end}; $tst_start = $tst_locator->{start}; $tst_end = $tst_locator->{end}; $overlap = 1 + (min($ref_end,$tst_end) - max($ref_start,$tst_start)); $minmax = \&max unless defined $minmax; return $overlap <= 0 ? 0 : $overlap / &$minmax ($ref_end-$ref_start+1, $tst_end-$tst_start+1); } ################################# sub audio_span_overlap { #This subroutine returns the minimum mutual overlap between two spans. my ($ref_locator, $tst_locator, $minmax) = @_; my ($ref_start, $ref_end, $tst_start, $tst_end, $overlap); my ($ref_overlap, $tst_overlap); my $nominal_frame_period = 0.01; $ref_start = $ref_locator->{tstart}; $ref_end = $ref_start + $ref_locator->{tdur}; $tst_start = $tst_locator->{tstart}; $tst_end = $tst_start + $tst_locator->{tdur}; $overlap = $nominal_frame_period + (min($ref_end,$tst_end) - max($ref_start,$tst_start)); $minmax = \&max unless defined $minmax; return $overlap <= 0 ? 0 : $overlap / &$minmax ($ref_end-$ref_start+$nominal_frame_period, $tst_end-$tst_start+$nominal_frame_period); } ################################# sub image_span_overlap { #This subroutine returns the minimum mutual overlap between two image spans. my ($ref_locator, $tst_locator, $minmax) = @_; my $ref_boxes = $ref_locator->{bblist}; my $tst_boxes = $tst_locator->{bblist}; my ($ref_box, $tst_box); my $nominal_pixel_size = 1; # = 1 cm???!!! no -- current scale is pixels rather than cm my $ref_area = 0; foreach $ref_box (@$ref_boxes) { $ref_area += ($ref_box->{width} + $nominal_pixel_size) * ($ref_box->{height} + $nominal_pixel_size); } my $tst_area = 0; foreach $tst_box (@$tst_boxes) { $tst_area += ($tst_box->{width} + $nominal_pixel_size) * ($tst_box->{height} + $nominal_pixel_size); } my $mutual_area = 0; foreach $ref_box (@$ref_boxes) { foreach $tst_box (@$tst_boxes) { next if ($ref_box->{page} ne $tst_box->{page}); my $x1 = max($ref_box->{x_start}, $tst_box->{x_start}); my $x2 = min($ref_box->{x_start}+$ref_box->{width}, $tst_box->{x_start}+$tst_box->{width}); next if ($x1 > $x2); my $y1 = max($ref_box->{y_start}, $tst_box->{y_start}); my $y2 = min($ref_box->{y_start}+$ref_box->{height}, $tst_box->{y_start}+$tst_box->{height}); next if ($y1 > $y2); $mutual_area += (($x2 - $x1 + $nominal_pixel_size) * ($y2 - $y1 + $nominal_pixel_size)); } } #return the minimum mutual overlap $minmax = \&max unless defined $minmax; return $mutual_area / &$minmax ($ref_area,$tst_area); } ################################# sub extent_mismatch { #This subroutine returns the maximum mismatch in the #extent of two locators my ($ref_locator, $tst_locator) = @_; if ($ref_locator->{data_type} eq "text" and $ref_locator->{data_type} eq "text") { return text_extent_mismatch ($ref_locator, $tst_locator); } elsif ($ref_locator->{data_type} eq "audio" and $ref_locator->{data_type} eq "audio") { return audio_extent_mismatch ($ref_locator, $tst_locator); } elsif ($ref_locator->{data_type} eq "image" and $ref_locator->{data_type} eq "image") { return image_extent_mismatch ($ref_locator, $tst_locator); } else { die "\n\nFATAL ERROR in extent_mismatch\n\n"; } } ################################# sub text_extent_mismatch { #This subroutine returns the maximum mismatch in the #character extent of two text streams my ($ref_locator, $tst_locator) = @_; my $extent_mismatch = 0; $extent_mismatch = max(abs($ref_locator->{start} - $tst_locator->{start}), abs($ref_locator->{end} - $tst_locator->{end}))/$max_diff_chars; } ################################# sub audio_extent_mismatch { #This subroutine returns the maximum mismatch in the #time extent of two audio signals my ($ref_locator, $tst_locator) = @_; my $extent_mismatch = 0; my ($ref_start, $ref_end, $tst_start, $tst_end); $ref_start = $ref_locator->{tstart}; $tst_start = $tst_locator->{tstart}; $ref_end = $ref_start + $ref_locator->{tdur}; $tst_end = $tst_start + $tst_locator->{tdur}; $extent_mismatch = max(abs($ref_start - $tst_start), abs($ref_end - $tst_end))/$max_diff_time; } ################################# sub image_extent_mismatch { #This subroutine returns the maximum mismatch in the #spatial extent of two images. my ($ref_locator, $tst_locator) = @_; my $ref_boxes = $ref_locator->{bblist}; my $tst_boxes = $tst_locator->{bblist}; my ($ref_box, $tst_box); my ($x_mismatch, $y_mismatch); my $huge_mismatch = 9999999; my $extent_mismatch = 0; foreach $ref_box (@$ref_boxes) { $x_mismatch = $y_mismatch = $huge_mismatch; foreach $tst_box (@$tst_boxes) { next if ($ref_box->{page} ne $tst_box->{page}); $x_mismatch = min($x_mismatch, abs($ref_box->{x_start} - $tst_box->{x_start}), abs($ref_box->{x_start}+$ref_box->{width} - $tst_box->{x_start}+$tst_box->{width})); $y_mismatch = min($y_mismatch, abs($ref_box->{y_start} - $tst_box->{y_start}), abs($ref_box->{y_start}+$ref_box->{height} - $tst_box->{y_start}+$tst_box->{height})); } $extent_mismatch = max($extent_mismatch, $x_mismatch, $y_mismatch); } return $extent_mismatch/$max_diff_xy; } ################################# sub max { my ($max, $x); $max = shift @_; foreach $x (@_) { if ($x > $max) {$max = $x} } return $max; } ################################# sub min { my ($min, $x); $min = shift @_; foreach $x (@_) { if ($x < $min) {$min = $x} } return $min; } ################################# sub get_document_data { my ($db, $docs, $file) = @_; my ($line, $name); my ($tag, $data, $doc_data, $span); my ($src_file, $source, $doc_id, %src_doc, $ndocs); my (@entities, $entity, $db_entity, $attribute); my (@relations, $relation, $db_relation, $argument, $arg, @args); my (@events, $event, $db_event, $id, $role, $participant); my (@timex2s, $timex2, $db_timex2); #read data from file open (FILE, $file) or die "\nUnable to open ACE data file '$input_file'", $usage; while ($line=){ $data .= $line; } close (FILE); #get source file information while (($tag, $doc_data, $data) = extract_sgml_tag_and_span ("source_file", $data)) { $data_type = lc extract_sgml_tag_attribute ($name="TYPE", $tag) or die "\n\nFATAL INPUT ERROR: no source file tag attribute '$name' in '$input_file'\n\n"; $data_type =~ /^(text|audio|image)$/ or die "\n\nFATAL INPUT ERROR: unknown document data type '$data_type' (in '$input_file')\n\n"; $src_file = extract_sgml_tag_attribute ($name="URI", $tag) or die "\n\nFATAL INPUT ERROR: no source file tag attribute '$name' in '$input_file'\n\n"; $source = extract_sgml_tag_attribute ($name="SOURCE", $tag) or die "\n\nFATAL INPUT ERROR: no source file tag attribute '$name' in '$input_file'\n\n"; $source_type = $source if not defined $source_type; $source_types{$source} = 1; $source_type = "MIXED" if $source ne $source_type; #get document data for all documents in the source file while (($tag, $span, $doc_data) = extract_sgml_tag_and_span ("document", $doc_data)) { $doc_id = extract_sgml_tag_attribute ("DOCID", $tag) or die "\n\nFATAL INPUT ERROR: no document ID found in tag '$tag' in file '$input_file'\n\n"; not defined $docs->{$doc_id} or die "\n\nFATAL INPUT ERROR: document ID '$doc_id' in file '$input_file' is not unique\n\n"; $input_doc = $doc_id; $docs->{$doc_id}{SOURCE} = $source; $docs->{$doc_id}{FILE} = $src_file; $docs->{$doc_id}{TYPE} = $data_type unless exists $docs->{$doc_id}{TYPE}; $data_type eq $docs->{$doc_id}{TYPE} or die "\n\nFATAL INPUT ERROR: all data for a given document must be of the same type\n". " data of type '$docs->{$doc_id}{TYPE} was previously loaded for document '$doc_id'\n". " but now file '$input_file' contains data of type '$data_type'\n\n"; $ndocs++; #load entity data into database @entities = get_entity_data ($span); foreach $entity (@entities) { $db->{entities}{$entity->{ID}} = {} unless defined $db->{entities}{$entity->{ID}}; $db_entity = $db->{entities}{$entity->{ID}}; $db_entity->{documents}{$doc_id} = {%$entity}; $db_entity->{documents}{$doc_id}{SOURCE} = $source; $db_entity->{ORIGIN} = "CORPUS" unless defined $db_entity->{ORIGIN}; foreach $attribute (@entity_attributes) { next unless defined $entity->{$attribute}; $db_entity->{$attribute} = $entity->{$attribute} unless defined $db_entity->{$attribute}; $entity->{$attribute} eq $db_entity->{$attribute} or die "\n\nFATAL INPUT ERROR: attribute value conflict for attribute '$attribute'" ." for entity '$entity->{ID}' in document '$doc_id'\n" ." database value is '$db_entity->{$attribute}'\n" ." document value is '$entity->{$attribute}'\n\n"; } promote_entity_mention_to_mention_entity ($db, $doc_id, $db_entity); } #load relation data into database @relations = get_relation_data ($span); foreach $relation (@relations) { $db->{relations}{$relation->{ID}} = {} unless defined $db->{relations}{$relation->{ID}}; $db_relation = $db->{relations}{$relation->{ID}}; $db_relation->{documents}{$doc_id} = {%$relation}; $db_relation->{documents}{$doc_id}{SOURCE} = $source; $db_relation->{ORIGIN} = "CORPUS" unless defined $db_relation->{ORIGIN}; foreach $attribute (@relation_attributes) { next unless defined $relation->{$attribute}; $db_relation->{$attribute} = $relation->{$attribute} unless defined $db_relation->{$attribute}; $relation->{$attribute} eq $db_relation->{$attribute} or die "\n\nFATAL INPUT ERROR: attribute value conflict for attribute '$attribute'" ." for relation '$relation->{ID}' in document '$doc_id'\n" ." database value is '$db_relation->{$attribute}'\n" ." document value is '$relation->{$attribute}'\n\n"; } foreach $time (@{$relation->{times}}) { push @{$db_relation->{times}}, $time unless num_relation_time_matches ([$time], $db_relation->{times}); } @args = defined $db_relation->{arguments} ? @{$db_relation->{arguments}} : (); delete $db_relation->{arguments}; foreach $argument (@{$relation->{arguments}}) { not @args or ($arg = shift @args) ne $argument or die "\n\nFATAL INPUT ERROR: argument conflict for relation '$relation->{ID}' in document '$doc_id'\n" ." input arg is '$argument->{ID}', but preexisting arg is '$arg'\n\n"; push @{$db_relation->{arguments}}, $argument; } promote_relation_mention_to_mention_relation ($db, $doc_id, $db_relation); } #load event data into database @events = get_event_data ($span); foreach $event (@events) { $db->{events}{$event->{ID}} = {} unless defined $db->{events}{$event->{ID}}; $db_event = $db->{events}{$event->{ID}}; $db_event->{documents}{$doc_id} = {%$event}; $db_event->{documents}{$doc_id}{SOURCE} = $source; $db_event->{ORIGIN} = "CORPUS" unless defined $db_event->{ORIGIN}; foreach $attribute (@event_attributes) { next unless defined $event->{$attribute}; $db_event->{$attribute} = $event->{$attribute} unless defined $db_event->{$attribute}; $event->{$attribute} eq $db_event->{$attribute} or die "\n\nFATAL INPUT ERROR: attribute value conflict for attribute '$attribute'" ." for event '$event->{ID}' in document '$doc_id'\n" ." database value is '$db_event->{$attribute}'\n" ." document value is '$event->{$attribute}'\n\n"; } foreach $role (keys %{$event->{participants}}) { foreach $id (keys %{$event->{participants}{$role}}) { $db_event->{participants}{$role}{$id} = $event->{participants}{$role}{$id}; } } defined $db_event->{participants} or die "\n\nFATAL INPUT ERROR: event $event->{ID} has no participants\n\n"; } #load timex2 data into database @timex2s = get_timex2_data ($span); foreach $timex2 (@timex2s) { $db->{timex2s}{$timex2->{ID}} = {} unless defined $db->{timex2s}{$timex2->{ID}}; $db_timex2 = $db->{timex2s}{$timex2->{ID}}; $db_timex2->{ID} = $timex2->{ID}; $db_timex2->{TYPE} = $timex2->{TYPE}; $db_timex2->{documents}{$doc_id} = {%$timex2}; $db_timex2->{documents}{$doc_id}{SOURCE} = $source; $db_timex2->{ORIGIN} = "CORPUS" unless defined $db_timex2->{ORIGIN}; $db_timex2->{attributes} = my $db_attributes = {} unless defined $db_timex2->{attributes}; my $doc_attributes = $timex2->{attributes}; foreach $attribute (keys %$doc_attributes) { $db_attributes->{$attribute} = $doc_attributes->{$attribute} unless defined $db_attributes->{$attribute}; $doc_attributes->{$attribute} eq $db_attributes->{$attribute} or die "\n\nFATAL INPUT ERROR: attribute value conflict for attribute '$attribute'" ." for timex2 '$timex2->{ID}' in document '$doc_id'\n" ." database value is '$db_attributes->{$attribute}'\n" ." document value is '$doc_attributes->{$attribute}'\n\n"; } } } } $ndocs or warn "\n\nWARNING: file '$input_file' contains no documents\n\n"; } ################################# sub promote_entity_mention_to_mention_entity { my ($db, $doc, $entity) = @_; my ($mention, $name, @mentions, @names, $new_id); my $num; @mentions = @{$entity->{documents}{$doc}{mentions}}; @names = @{$entity->{documents}{$doc}{names}}; foreach $mention (@mentions) { $num++; $entity_serial_number++; $new_id = "$entity->{ID} mention $num SN$entity_serial_number"; $db->{entities}{$entity->{ID}}{MENTION_ENTITY_IDS}{$mention->{ID}} = $new_id; $db->{mention_entities}{$new_id} = {%$entity}; $db->{mention_entities}{$new_id}{ID} = $new_id; $db->{mention_entities}{$new_id}{documents} = {%{$entity->{documents}}}; $db->{mention_entities}{$new_id}{documents} = {$doc => {%{$entity->{documents}{$doc}}}}; $db->{mention_entities}{$new_id}{documents}{$doc}{ID} = $new_id; $db->{mention_entities}{$new_id}{documents}{$doc}{mentions} = [{%$mention}]; my $max_overlap = $min_overlap; my $new_name; foreach $name (@names) { next unless defined $mention->{head}; my $overlap = span_overlap ($name->{locator}, $mention->{head}{locator}); next if $overlap < $max_overlap; $new_name = $name; $max_overlap = $overlap; } if ($new_name) { $mention->{TYPE} eq "NAME" or warn "\n\nWARNING: NAME found for un-NAME mention $new_id (name = '$new_name->{text}')\n\n"; } else { $mention->{TYPE} ne "NAME" or $mention->{STYLE} eq "METONYMIC" or warn "\n\nWARNING: no NAME found for NAME mention $new_id\n\n"; $new_name = $mention->{head} if $mention->{TYPE} eq "NAME" and $mention->{STYLE} eq "METONYMIC"; } $db->{mention_entities}{$new_id}{documents}{$doc}{names} = $new_name ? [{%$new_name}] : undef; } $num or warn "\n\nWARNING: no mentions found for entity '$entity->{ID}' in promote_entity_mention_to_mention_entity\n\n"; } ################################# sub promote_relation_mention_to_mention_relation { my ($db, $doc, $relation) = @_; my @entityids; my $num; my @mentions = @{$relation->{documents}{$doc}{mentions}}; foreach my $arg (@{$relation->{arguments}}) { $entityids[$arg->{ARGNUM}] = $arg->{ID}; } foreach my $mention (@mentions) { $num++; $relation_serial_number++; my $new_id = "$relation->{ID} mention $num SN$relation_serial_number"; my $mention_relation = $db->{mention_relations}{$new_id} = {%$relation}; $mention_relation->{documents} = {$doc => {%{$relation->{documents}{$doc}}}}; $mention_relation->{documents}{$doc}{ID} = $mention_relation->{ID} = $new_id; $mention_relation->{documents}{$doc}{mentions} = [$mention={%$mention}]; my @args; foreach my $arg (@{$mention->{arguments}}) { my $mention_entity_id = $db->{entities}{$entityids[$arg->{ARGNUM}]}{MENTION_ENTITY_IDS}{$arg->{MENTIONID}}; defined $mention_entity_id or die "\n\nFATAL ERROR: no mention entity ID for arg mention '$arg->{MENTIONID}' of relation '$relation->{ID}'\n\n"; push @args, {ARGNUM => $arg->{ARGNUM}, ID => $mention_entity_id}; } $mention_relation->{documents}{$doc}{arguments} = $mention_relation->{arguments} = [@args]; } $num or warn "\n\nWARNING: no mentions found for relation '$relation->{ID}' in promote_relation_mention_to_mention_relation\n\n"; } ################################# sub get_event_data { #extract document-level information for all events in the document my ($data) = @_; my (@events, %event_ids); my ($tag, $span, $type, $modality, $class, $attribute, $mention); while (($tag, $span, $data) = extract_sgml_tag_and_span ("event", $data)) { my %event; #get event ID ($input_event) = extract_sgml_tag_attribute ("ID", $tag) or die "\n\nFATAL INPUT ERROR: no event ID found for an event in file '$input_file'\n". " Event tag is '$tag'\nEvent data is '$span'\n"; $input_event =~ s/^\s*|\s*$//g; #trim any white space from beginning/end of event ID not defined $event_ids{$input_event} or die "\n\nFATAL ERROR: multiple definitions of event '$input_event'\n". " (every event ID must be unique)\n\n"; $event_ids{$input_event} = 1; $event{ID} = $input_event; #get event TYPE ($type) = extract_sgml_tag_attribute ("TYPE", $tag) or die "\n\nFATAL INPUT ERROR for event '$input_event' in document '$input_doc'". " in file '$input_file'\n no event TYPE found. Event tag is '$tag'\n\n"; $event{TYPE} = $normalize_event_type{$type} or die "\n\nFATAL INPUT ERROR for event '$input_event' in document '$input_doc'". " in file '$input_file'\n unrecognized event type ($type). Event tag is '$tag'\n\n"; #get event MODALITY ($modality) = extract_sgml_tag_attribute ("MODALITY", $tag) or die "\n\nFATAL INPUT ERROR for event '$input_event' in document '$input_doc'". " in file '$input_file'\n no event MODALITY found. Event tag is '$tag'\n\n"; $event{MODALITY} = $normalize_event_modality{$modality} or die "\n\nFATAL INPUT ERROR for event '$input_event' in document '$input_doc'". " in file '$input_file'\n unrecognized event sub-type ($modality). Event tag is '$tag'\n\n"; $event{participants} = get_event_participants ($span); $event{mentions} = [get_event_mentions ($span)]; promote_event_mention_participants (\%event); push @events, {%event}; } return @events; } ################################# sub get_event_participants { my ($event_data) = @_; my ($tag, $span, $participant, %participants); my $remaining_data = $event_data; while ($participant = get_event_participant ($remaining_data)) { not defined $participants{$participant->{ROLE}}{$participant->{ID}} or die "\n\nFATAL INPUT ERROR for event '$input_event' in document '$input_doc' in file '$input_file'\n". " Multiple entries for participant '$participant->{ID} in ROLE $participant->{ROLE}'.\n". " Event data is '$event_data'\n\n"; $participants{$participant->{ROLE}}{$participant->{ID}} = $participant; ($tag, $span, $remaining_data) = extract_sgml_tag_and_span ("event_participant", $remaining_data); } %participants or die "\n\nFATAL INPUT ERROR for event '$input_event' in document '$input_doc' in file '$input_file'\n". " There must be at least one participant in the event, but there are none.\n". " Event data is '$event_data'\n\n"; return {%participants}; } ################################# sub get_event_participant { my ($data) = @_; my ($tag, $span, %participant, $role); return undef unless ($tag, $span) = extract_sgml_tag_and_span ("event_participant", $data); $participant{ID} = extract_sgml_tag_attribute ("ENTITYID", $tag) or die "\n\nFATAL INPUT ERROR for event '$input_event' in document '$input_doc' in file '$input_file'\n". " No participant ENTITYID attribute. Participant data is '$data'\n\n"; $role = extract_sgml_tag_attribute ("ROLE", $tag) or die "\n\nFATAL INPUT ERROR for event '$input_event' in document '$input_doc' in file '$input_file'\n". " No ROLE attribute. Participant data is '$data'\n\n"; $participant{ROLE} = $normalize_event_participant_role{$role} or die "\n\nFATAL INPUT ERROR for event '$input_event' in document '$input_doc' in file '$input_file'\n". " Unrecognized ROLE attribute ($role). Participant data is '$data'\n\n"; return {%participant}; } ################################# sub get_event_mentions { my ($data) = @_; my ($tag, $span, @mentions, $mention); while ($mention = get_event_mention ($data)) { push @mentions, $mention; ($tag, $span, $data) = extract_sgml_tag_and_span ("event_mention", $data); } return @mentions; } ################################# sub get_event_mention { my ($data) = @_; my ($tag, $span, %mention, $type); return undef unless ($tag, $span) = extract_sgml_tag_and_span ("event_mention", $data); $mention{ID} = extract_sgml_tag_attribute ("ID", $tag) or die "\n\nFATAL INPUT ERROR for event '$input_event' in document '$input_doc' in file '$input_file'\n". " ID attribute is missing. Event data is '$data'\n\n"; $type = extract_sgml_tag_attribute ("TYPE", $tag) or die "\n\nFATAL INPUT ERROR for event '$input_event' in document '$input_doc' in file '$input_file'\n". " TYPE attribute is missing. Event data is '$data'\n\n"; $mention{TYPE} = $normalize_event_mention_type{$type} or die "\n\nFATAL INPUT ERROR for event '$input_event' in document '$input_doc' in file '$input_file'\n". " Unrecognized TYPE attribute ($type). Participant data is '$data'\n\n"; $mention{extent} = get_locator ("extent", $data); $mention{anchor} = get_locator ("anchor", $data); $mention{participants} = get_event_mention_participants ($span); return {%mention}; } ################################# sub get_event_mention_participants { my ($mention_data) = @_; my ($tag, $span, $participant, %participants); my $remaining_data = $mention_data; while ($participant = get_event_mention_participant ($remaining_data)) { not defined $participants{$participant->{ROLE}}{$participant->{ID}} or die "\n\nFATAL INPUT ERROR for event '$input_event' in document '$input_doc' in file '$input_file'\n". " Multiple entries for participant '$participant->{ID}'. Mention data is '$mention_data'\n\n"; $participants{$participant->{ROLE}}{$participant->{ID}} = $participant; ($tag, $span, $remaining_data) = extract_sgml_tag_and_span ("event_mention_participant", $remaining_data); } return {%participants}; } ################################# sub get_event_mention_participant { my ($data) = @_; my ($tag, $span, %participant, $role, $source); return undef unless ($tag, $span) = extract_sgml_tag_and_span ("event_mention_participant", $data); $participant{ID} = extract_sgml_tag_attribute ("ENTITYID", $tag) or die "\n\nFATAL INPUT ERROR for event '$input_event' in document '$input_doc' in file '$input_file'\n". " No participant ENTITYID attribute. Participant data is '$data'\n\n"; $participant{MENTIONID} = extract_sgml_tag_attribute ("ENTITYMENTIONID", $tag) or die "\n\nFATAL INPUT ERROR for event '$input_event' in document '$input_doc' in file '$input_file'\n". " No participant ENTITYMENTIONID attribute. Participant data is '$data'\n\n"; $role = extract_sgml_tag_attribute ("ROLE", $tag) or die "\n\nFATAL INPUT ERROR for event '$input_event' in document '$input_doc' in file '$input_file'\n". " No ROLE attribute. Participant data is '$data'\n\n"; $participant{ROLE} = $normalize_event_participant_role{$role} or die "\n\nFATAL INPUT ERROR for event '$input_event' in document '$input_doc' in file '$input_file'\n". " Unrecognized ROLE attribute ($role). Participant data is '$data'\n\n"; $participant{extent} = get_locator ("extent", $span); return {%participant}; } ################################# sub promote_event_mention_participants { my ($event) = @_; my ($mention, $role, $id); foreach $mention (@{$event->{mentions}}) { foreach $role (keys %{$mention->{participants}}) { foreach $id (keys %{$mention->{participants}{$role}}) { $event->{participants}{$role}{$id} = $mention->{participants}{$role}{$id}; } } } } ################################# sub print_event_data { my ($type, $db) = @_; my ($event_id, $event, $role, $id); my ($doc_id, $doc, $doc_info); my ($attribute, $mention, $entity); print "\n======== $type events ========\n\n"; foreach $event_id (sort keys %{$db->{events}}) { $event = $db->{events}{$event_id}; print "event ID=$event->{ID}"; foreach $attribute (@event_attributes) { next if $attribute eq "ID"; print ", $attribute=$event->{$attribute}" if defined $event->{$attribute}; } print "\n"; my @participant_info; foreach $role (sort keys %{$event->{participants}}) { foreach $id (sort keys %{$event->{participants}{$role}}) { printf " %s\n", event_participant_description ($db, $event->{participants}{$role}{$id}); } } print_event_mentions ($db, $event_id); } } ################################# sub event_participant_description { my ($db, $participant, $text) = @_; my $id = $participant->{ID}; my $out = sprintf "%9s participant ID=%s", $participant->{ROLE}, $id; my $entity = $db->{entities}{$id}; if (my $name=longest_entity_name($entity)) { $out .= sprintf ", name=\"%s\"", limit_string($max_string_length, $name); } elsif (my $head=longest_entity_mention_head($entity)) { $out .= sprintf ", head=\"%s\"", limit_string($max_string_length, $head); } $out .= limit_string ($max_string_length, $text) if $text; return $out; } ################################# sub limit_string { my ($max_len, $string) = @_; my $len_str = length $string; return $string if $len_str <= $max_len; return substr ($string, 0, $max_len-3)."..."; } ################################# sub print_event_mentions { my ($db, $event_id, $text) = @_; my ($docs, $doc, @mentions, $mention, $role, $id); $docs = $db->{events}{$event_id}{documents}; foreach $doc (sort keys %{$docs}) { print " -- in document $doc\n"; @mentions = @{$docs->{$doc}{mentions}}; foreach $mention (@mentions) { printf " mention ID=$mention->{ID}, TYPE=$mention->{TYPE}, anchor=%s, extent=%s\n", limit_string ($max_string_length, $mention->{anchor}{text}), limit_string ($max_string_length, $mention->{extent}{text}); foreach $role (sort keys %{$mention->{participants}}) { foreach $id (sort keys %{$mention->{participants}{$role}}) { printf " %s\n", mention_participant_description ($db, $mention->{participants}{$role}{$id}); } } } } } ################################# sub mention_participant_description { my ($db, $participant, $text) = @_; my $out = sprintf "%9s participant mention ID=%s (%s), extent=%s", $participant->{ROLE}, $participant->{MENTIONID}, $participant->{ID}, $participant->{extent}{text} ? limit_string ($max_string_length, $participant->{extent}{text}) : "???"; $out .= sprintf ", text=\"$text\"" if $text; return $out; } ################################# sub compute_event_values { my ($ref_id, $ref_event, $tst_id, $tst_event, $value); my ($doc, $doc_event, $ref_occ, $tst_occ, %doc_events, %mapped_value_computed); my $ref_events = $ref_database{events}; my $tst_events = $tst_database{events}; #group ref events by document foreach $ref_id (keys %$ref_events) { $ref_event = $ref_events->{$ref_id}; foreach $doc (keys %{$ref_event->{documents}}) { $doc_events{$doc}{REF}{$ref_id} = $ref_event; $ref_event->{documents}{$doc}{VALUE} = mapped_event_document_value ($doc, \%ref_database, $ref_event); $ref_event->{VALUE} += $ref_event->{documents}{$doc}{VALUE}; } } #group tst events by document foreach $tst_id (keys %$tst_events) { $tst_event = $tst_events->{$tst_id}; foreach $doc (keys %{$tst_event->{documents}}) { $doc_events{$doc}{TST}{$tst_id} = $tst_event; $tst_event->{documents}{$doc}{VALUE} = mapped_event_document_value ($doc, \%tst_database, $tst_event); $tst_event->{VALUE} += $tst_event->{documents}{$doc}{VALUE}; } } #compute ref-tst mapped event values undef %mapped_event_value; foreach $doc (keys %doc_events) { foreach $ref_id (keys %{$doc_events{$doc}{REF}}) { foreach $tst_id (keys %{$doc_events{$doc}{TST}}) { next if $mapped_value_computed{$ref_id}{$tst_id}; ($mapped_event_value{$ref_id}{$tst_id}) = event_value (\%ref_database, $ref_events->{$ref_id}, \%tst_database, $tst_events->{$tst_id}); $mapped_value_computed{$ref_id}{$tst_id} = 1; } } } } ################################# sub map_events { my ($ref_id, $tst_id, $ref_event, $tst_event); my (@ref_events, @tst_events, @ref_cohorts, @tst_cohorts); #collect events foreach $ref_id (keys %{$ref_database{events}}) { $ref_event = $ref_database{events}{$ref_id}; push @ref_events, $ref_event; } foreach $tst_id (keys %{$tst_database{events}}) { $tst_event = $tst_database{events}{$tst_id}; push @tst_events, $tst_event; } #group events into cohort sets and map each set independently foreach $ref_event (@ref_events) { next if exists $ref_event->{cohort}; tag_cohorts ($ref_event, \@ref_events, \@tst_events, \%mapped_event_value, 1); @ref_cohorts = collect_cohorts (\@ref_events); @tst_cohorts = collect_cohorts (\@tst_events); map_cohorts (\@ref_cohorts, \@tst_cohorts, \%mapped_event_value, $event_fa_wgt); foreach $ref_event (@ref_cohorts) { $tst_event = $ref_event->{MAP}; next unless $tst_event; map_document_level_event_mentions ($ref_event, $tst_event); foreach my $doc (keys %{$ref_event->{documents}}, keys %{$tst_event->{documents}}) { next if defined $mapped_event_document_value{$doc}{$ref_event->{ID}}{$tst_event->{ID}}; $mapped_event_document_value{$doc}{$ref_event->{ID}}{$tst_event->{ID}} = mapped_event_document_value ($doc, \%ref_database, $ref_event, \%tst_database, $tst_event); } } } } ################################# sub map_document_level_event_mentions { my ($ref_event, $tst_event) = @_; my ($score, $mapping, $map, $p_ref, $p_tst, $doc, $ref_occ, $tst_occ); ($score, $mapping) = event_participants_score (\%ref_database, $ref_event, \%tst_database, $tst_event); #map event participants foreach $map (@$mapping) { next unless defined $map->{REFID}; $p_ref = $ref_event->{participants}{$map->{REFROLE}}{$map->{REFID}}; $p_tst = $tst_event->{participants}{$map->{TSTROLE}}{$map->{TSTID}}; $p_ref->{MAP} = $p_tst; $p_tst->{MAP} = $p_ref; } #map document level info foreach $doc (keys %{$ref_event->{documents}}) { $ref_occ = $ref_event->{documents}{$doc}; $tst_occ = $tst_event->{documents}{$doc}; $ref_occ->{MAP} = $tst_occ; $tst_occ->{MAP} = $ref_occ; foreach $map (@$mapping) { next unless defined $map->{REFID}; $p_ref = $ref_occ->{participants}{$map->{REFROLE}}{$map->{REFID}}; $p_tst = $tst_occ->{participants}{$map->{TSTROLE}}{$map->{TSTID}}; next unless $p_ref and $p_tst; $p_ref->{MAP} = $p_tst; $p_tst->{MAP} = $p_ref; } } } ################################# sub event_value { #N.B. The event mapping score must be undef if a match is not possible my ($ref_db, $ref_event, $tst_db, $tst_event) = @_; my ($role, $id, $participants_score, @ref_participants, @tst_participants); my ($event_value, $map, $ref_id, $tst_id, $ref_occ, $tst_occ); #events must exist my $self_score = not defined $tst_event; $tst_db = $ref_db if $self_score; $tst_event = $ref_event if $self_score; return undef unless $ref_db and $ref_event; #compute participants score ($participants_score, $map) = event_participants_score ($ref_db, $ref_event, $tst_db, $tst_event); foreach my $doc (keys %{$ref_event->{documents}}) { my $ref_occ = $ref_event->{documents}{$doc}; my $tst_occ = $tst_event->{documents}{$doc}; next unless $tst_occ; my $value = mapped_event_document_value ($doc, $ref_db, $ref_event, $tst_db, $tst_event, $map); $event_value += $value if $value; } return $event_value; } ################################# sub mapped_event_document_value { my ($doc, $ref_db, $ref_event, $tst_db, $tst_event, $map) = @_; my ($ref_mention, $tst_mention, $event_value, $participants_score); $participants_score = mapped_event_participants_document_score ($doc, $ref_db, $ref_event, $tst_db, $tst_event, $map); #give epsilon credit for overlapping anchors my $ref_occ = $ref_event->{documents}{$doc}; $tst_event = $ref_event unless $tst_event; my $tst_occ = $tst_event->{documents}{$doc}; foreach $ref_mention (@{$ref_occ->{mentions}}) { foreach $tst_mention (@{$tst_occ->{mentions}}) { next unless span_overlap($ref_mention->{anchor}{locator}, $tst_mention->{anchor}{locator}) > $min_overlap; $participants_score += $epsilon; } } return undef unless $participants_score; $event_value = min($event_type_wgt{$ref_event->{TYPE}} * $event_modality_wgt{$ref_event->{MODALITY}}, $event_type_wgt{$tst_event->{TYPE}} * $event_modality_wgt{$tst_event->{MODALITY}}); #reduce value for errors in event attributes $event_value *= $event_err_wgt{TYPE} if $ref_event->{TYPE} ne $tst_event->{TYPE}; $event_value *= $event_err_wgt{MODALITY} if $ref_event->{MODALITY} ne $tst_event->{MODALITY}; return $event_value*$participants_score; } ################################# sub event_participants_score { my ($ref_db, $ref_event, $tst_db, $tst_event) = @_; my ($role, $ref_id, $tst_id); my (@ref_participants, @tst_participants, $participant); my ($doc, $score, $fa_score); $tst_db = $ref_db unless defined $tst_db; $tst_event = $ref_event unless defined $tst_event; return undef unless defined $ref_db and defined $ref_event and defined $ref_event->{participants} and defined $tst_event->{participants}; #collect participants foreach $role (keys %{$ref_event->{participants}}) { foreach $ref_id (keys %{$ref_event->{participants}{$role}}) { $participant = {ID => $ref_id, ROLE => $role}; push @ref_participants, $participant; } } return undef unless @ref_participants; foreach $role (keys %{$tst_event->{participants}}) { foreach $tst_id (keys %{$tst_event->{participants}{$role}}) { $participant = {ID => $tst_id, ROLE => $role}; push @tst_participants, $participant; } } return undef unless @tst_participants; #collect participant mapping scores my %mapping_costs; my $self_score = $ref_db eq $tst_db; for (my $i=0; $i<@ref_participants; $i++) { $ref_id = $ref_participants[$i]->{ID}; my $min_value = $event_participant_threshold * $ref_db->{entities}{$ref_id}{VALUE}; for (my $j=0; $j<@tst_participants; $j++) { $tst_id = $tst_participants[$j]->{ID}; my $match_ok = $self_score ? 1 : (defined $mapped_entity_value{$ref_id}{$tst_id} ? $mapped_entity_value{$ref_id}{$tst_id} > (1-$required_precision)*$min_value : undef); $score = $fa_score = 0; foreach $doc (keys %{$tst_db->{events}{$tst_event->{ID}}{documents}}) { if ($self_score) { $score += $ref_db->{entities}{$ref_id}{documents}{$doc}{VALUE} if $i == $j; } elsif ($match_ok and defined $mapped_entity_document_value{$doc}{$ref_id}{$tst_id}) { $score += $mapped_entity_document_value{$doc}{$ref_id}{$tst_id}; } elsif (defined $tst_db->{entities}{$tst_id}{documents}{$doc} and defined $tst_db->{entities}{$tst_id}{documents}{$doc}{VALUE}) { $fa_score += $tst_db->{entities}{$tst_id}{documents}{$doc}{VALUE}; } } $score *= $event_participant_role_err_wgt if $ref_participants[$i]->{ROLE} ne $tst_participants[$j]->{ROLE}; $fa_score *= $event_participant_fa_wgt; $mapping_costs{$j}{$i} -= $score + $fa_score; } } return undef unless %mapping_costs; #find optimum mapping my ($map) = weighted_bipartite_graph_matching(\%mapping_costs) or die "\n\nFATAL ERROR: Event participant mapping through BGM FAILED\n\n"; my $participants_map; for (my $j=0; $j<@tst_participants; $j++) { my $i = $map->{$j}; push @$participants_map, {REFROLE => (defined $i ? $ref_participants[$i]->{ROLE} : undef), REFID => (defined $i ? $ref_participants[$i]->{ID} : undef), TSTROLE => $tst_participants[$j]->{ROLE}, TSTID => $tst_participants[$j]->{ID}}; } my $participants_score = 0; foreach $doc (keys %{$ref_db->{events}{$ref_event->{ID}}{documents}}) { $participants_score += mapped_event_participants_document_score ($doc, $ref_db, $ref_event, $tst_db, $tst_event, $participants_map); } return ($participants_score, $participants_map); } ################################# sub mapped_event_participants_document_score { my ($doc, $ref_db, $ref_event, $tst_db, $tst_event, $map) = @_; my ($ref_role, $ref_id, $tst_role, $tst_id, $mapped_participant); my $total_score = 0; my $self_score = $tst_event ? $ref_event eq $tst_event : 1; $map = get_event_participants_map ($ref_event, $self_score) if not $map; foreach $mapped_participant (@$map) { $ref_role = $mapped_participant->{REFROLE}; $ref_id = $mapped_participant->{REFID}; $tst_role = $mapped_participant->{TSTROLE}; $tst_id = $mapped_participant->{TSTID}; if (defined $ref_id) { my $score; if ($self_score) { $score = $ref_db->{entities}{$ref_id}{documents}{$doc}{VALUE}; } elsif (defined $mapped_entity_document_value{$doc}{$ref_id} and defined $mapped_entity_document_value{$doc}{$ref_id}{$tst_id}) { $score = $mapped_entity_document_value{$doc}{$ref_id}{$tst_id}; $score *= $event_participant_role_err_wgt if $ref_role ne $tst_role; } $total_score += $score if $score; } else { next if not defined $tst_db->{entities}{$tst_id}{documents}{$doc}; my $fa_score -= $tst_db->{entities}{$tst_id}{documents}{$doc}{VALUE}; $fa_score *= $event_participant_fa_wgt; $total_score += $fa_score; } } return $total_score; } ################################# sub get_event_participants_map { my ($ref_event, $self_score) = @_; my ($tst_event, $ref_role, $ref_id, $tst_role, $tst_id, $map, $ref_map); if ($self_score) { foreach $ref_role (keys %{$ref_event->{participants}}) { foreach $ref_id (keys %{$ref_event->{participants}{$ref_role}}) { push @$map, {REFROLE => $ref_role, REFID => $ref_id, TSTROLE => $ref_role, TSTID => $ref_id}; } } return $map; } $tst_event = $ref_event->{MAP}; return undef unless $tst_event; foreach $tst_role (keys %{$tst_event->{participants}}) { foreach $tst_id (keys %{$tst_event->{participants}{$tst_role}}) { $ref_map = $tst_event->{participants}{$tst_role}{$tst_id}{MAP}; $ref_role = defined $ref_map ? $ref_map->{ROLE} : undef; $ref_id = defined $ref_map ? $ref_map->{ID} : undef; push @$map, {REFROLE => $ref_role, REFID => $ref_id, TSTROLE => $tst_role, TSTID => $tst_id}; } } return $map; } ################################# sub num_event_participants { my ($event) = @_; my ($role, $id); return 0 unless defined $event and defined $event->{participants}; #count participants my $n = 0; foreach $role (keys %{$event->{participants}}) { foreach $id (keys %{$event->{participants}{$role}}) { $n++; } } return $n; } ################################# sub print_event_mapping { my ($ref_db, $tst_db, $label) = @_; my ($ref_event, $tst_event, $ref_id, $tst_id, $role, $id); my ($print_data, $error_type, $pre_err, $post_err, $p_ref, $e_ref, $p_tst, $e_tst, $part_err); my ($type_mismatch, $modality_mismatch); print "\n======== $label ========\n\n"; foreach $ref_id (sort keys %{$ref_db->{events}}) { $print_data = $print_all_data; $ref_event = $ref_db->{events}{$ref_id}; if ($tst_event = $ref_event->{MAP}) { $tst_id = $tst_event->{ID}; undef $error_type; $type_mismatch = $ref_event->{TYPE} ne $tst_event->{TYPE}; $modality_mismatch = $ref_event->{MODALITY} ne $tst_event->{MODALITY}; $error_type = "TYPE" if $type_mismatch; $error_type .= defined $error_type ? "/MODALITY" : "MODALITY" if $modality_mismatch; $print_data ||= $print_err_data if $error_type; next unless $print_data; #print mapped events print "--------\n"; printf " event score:%9.5f out of%9.5f\n", $mapped_event_value{$ref_id}{$tst_id}, $ref_event->{VALUE}; $pre_err = $error_type ? ">>>" : " "; $post_err = $error_type ? " -- $error_type MISMATCH" : ""; print "$pre_err ref event $ref_id ($ref_event->{TYPE}/$ref_event->{MODALITY})$post_err\n"; foreach $role (sort keys %{$ref_event->{participants}}) { foreach $id (sort keys %{$ref_event->{participants}{$role}}) { $e_ref = $ref_db->{entities}{$id}; $p_ref = $ref_event->{participants}{$role}{$id}; $p_tst = $p_ref->{MAP}; $part_err = (not $p_tst) ? ", NO CORRESPONDING TST PARTICIPANT" : (((not $e_ref->{MAP}) ? ", UNMAPPED PARTICIPANT" : ($p_ref->{MAP}{ID} and $e_ref->{MAP}{ID} eq $p_ref->{MAP}{ID}) ? "" : ", MISMAPPED PARTICIPANT") .(not $p_tst or $p_tst->{ROLE} eq $role ? "" : ", ROLE MISMATCH")); printf "%s\n", ($part_err ? ">>> ":" ").event_participant_description ($ref_db, $p_ref, $part_err); } } print_event_mentions ($ref_db, $ref_id); print "$pre_err tst event $tst_id ($tst_event->{TYPE}/$tst_event->{MODALITY})$post_err\n"; foreach $role (sort keys %{$tst_event->{participants}}) { foreach $id (sort keys %{$tst_event->{participants}{$role}}) { $e_tst = $tst_db->{entities}{$id}; $p_tst = $tst_event->{participants}{$role}{$id}; $p_ref = $p_tst->{MAP}; $part_err = (not $p_ref) ? ", NO CORRESPONDING REF PARTICIPANT" : (((not $e_tst->{MAP}{ID}) ? ", UNMAPPED PARTICIPANT" : ($p_tst->{MAP}{ID} and $e_tst->{MAP}{ID} eq $p_tst->{MAP}{ID}) ? "" : ", MISMAPPED PARTICIPANT") .(not $p_ref or $p_ref->{ROLE} eq $role ? "" : ", ROLE MISMATCH")); printf "%s\n", ($part_err ? ">>> ":" ").event_participant_description ($tst_db, $p_tst, $part_err); } } print_event_mentions ($tst_db, $tst_id); } #print unmapped reference events else { $print_data ||= $print_err_data; next unless $print_data; print "--------\n"; printf " event score: 0.00000 out of%9.5f\n", $ref_event->{VALUE}; $pre_err = ">>>"; $post_err = " -- NO MATCHING TST EVENT"; print "$pre_err ref event $ref_id ($ref_event->{TYPE}/$ref_event->{MODALITY})$post_err\n"; foreach $role (sort keys %{$ref_event->{participants}}) { foreach $id (sort keys %{$ref_event->{participants}{$role}}) { printf " %s\n", event_participant_description ($ref_db, $ref_event->{participants}{$role}{$id}); } } print_event_mentions ($ref_db, $ref_id); } } #print unmapped test events $print_data ||= $print_err_data; return unless $print_data; foreach $tst_id (sort keys %{$tst_db->{events}}) { $tst_event = $tst_db->{events}{$tst_id}; next if $tst_event->{MAP}; print "--------\n"; printf " event score:%9.5f out of 0.00000\n", -$tst_event->{VALUE}; $pre_err = ">>>"; $post_err = " -- NO MATCHING REF EVENT"; print "$pre_err tst event $tst_id ($tst_event->{TYPE}/$tst_event->{MODALITY})$post_err\n"; foreach $role (sort keys %{$tst_event->{participants}}) { foreach $id (sort keys %{$tst_event->{participants}{$role}}) { printf " %s\n", event_participant_description ($tst_db, $tst_event->{participants}{$role}{$id}); } } print_event_mentions ($tst_db, $tst_id); } } ################################# sub get_timex2_data { #extract document-level information for all timex2s in the document my ($data) = @_; my (@timex2s, %timex2_ids); my ($tag, $span, $type, $modality, $class, $attribute, $mention); while (($tag, $span, $data) = extract_sgml_tag_and_span ("timex", $data)) { my %timex2; #get timex2 attributes and ID ($timex2{attributes}) = extract_sgml_tag_attributes ($tag); defined $timex2{attributes}{ID} or die "\n\nFATAL INPUT ERROR: no timex2 ID found for a timex2 in file '$input_file'\n". " Timex2 tag is '$tag'\nTimex2 data is '$span'\n"; defined $timex2{attributes}{TYPE} or die "\n\nFATAL INPUT ERROR: no timex2 TYPE found for a timex2 in file '$input_file'\n". " Timex2 tag is '$tag'\nTimex2 data is '$span'\n"; $timex2{attributes}{TYPE} eq "TIMEX2" or die "\n\nFATAL INPUT ERROR: timex2 TYPE must be TIMEX2 but is '$timex2{attributes}{TYPE}' in file '$input_file'\n". " Timex2 tag is '$tag'\nTimex2 data is '$span'\n"; $input_timex2 = $timex2{attributes}{ID}; $input_timex2 =~ s/^\s*|\s*$//g; #trim any white space from beginning/end of timex2 ID not defined $timex2_ids{$input_timex2} or die "\n\nFATAL ERROR: multiple definitions of timex2 '$input_timex2'\n". " (every timex2 ID must be unique)\n\n"; $timex2_ids{$input_timex2} = 1; $timex2{mentions} = [get_timex2_mentions (\%timex2, $span)]; #set default values $timex2{attributes}{SET} = "NO" unless defined $timex2{attributes}{SET}; $timex2{attributes}{NON_SPECIFIC} = "NO" unless defined $timex2{attributes}{NON_SPECIFIC}; #move ID and TYPE out of attribute hash $timex2{ID} = $input_timex2; delete $timex2{attributes}{ID}; $timex2{TYPE} = $timex2{attributes}{TYPE}; delete $timex2{attributes}{TYPE}; push @timex2s, {%timex2}; } return @timex2s; } ################################# sub get_timex2_mentions { #extract mention information for all mentions of a timex2 in a document my ($timex2, $data) = @_; my (%mention, @mentions); my ($tag, $span, $attribute, $head); while (($tag, $span, $data) = extract_sgml_tag_and_span ("timex_mention", $data)) { undef %mention; #get timex2 mention attributes ($mention{attributes}) = extract_sgml_tag_attributes ($tag); if (defined $mention{attributes}{ID}) { $mention{ID} = $mention{attributes}{ID}; delete $mention{attributes}{ID}; } #promote timex2 mention attributes foreach $attribute (keys %{$mention{attributes}}) { if (not $timex2->{attributes}{$attribute}) { $timex2->{attributes}{$attribute} = $mention{attributes}{$attribute}; } elsif ($attribute eq "COMMENT") { $timex2->{attributes}{$attribute} .= "\n$mention{attributes}{$attribute}"; } else { $timex2->{attributes}{$attribute} eq $mention{attributes}{$attribute} or die "\n\nFATAL INPUT ERROR for timex2 '$input_timex2' in document '$input_doc'\n". " inconsistent attribute values for attribute $attribute". " ('$timex2->{attributes}{$attribute}' versus '$mention{attributes}{$attribute}')\n\n"; } } #get timex2 mention head $mention{head} = get_locator ("head", $span); #get timex2 mention extent $mention{extent} = get_locator ("extent", $span); defined $mention{head} or defined $mention{extent} or die "\n\nFATAL INPUT ERROR for timex2 '$input_timex2' in document '$input_doc'\n". " no mention head or extent found in data ($span)\n\n"; push @mentions, {%mention}; } @mentions > 0 or die "\n\nFATAL INPUT ERROR for timex2 '$input_timex2' in document '$input_doc'\n". " timex2 contains no mentions\n\n"; return @mentions; } ################################# sub compute_timex2_values { my ($ref_id, $ref_timex2, $tst_id, $tst_timex2, $value); my ($doc, $doc_timex2, $ref_occ, $tst_occ, %doc_refs, %doc_tsts); my $ref_timex2s = $ref_database{timex2s}; my $tst_timex2s = $tst_database{timex2s}; #group ref timex2s by document foreach $ref_id (keys %$ref_timex2s) { $ref_timex2 = $ref_timex2s->{$ref_id}; $ref_timex2->{VALUE} = 0; foreach $doc (keys %{$ref_timex2->{documents}}) { $ref_occ = $ref_timex2->{documents}{$doc}; push @{$doc_refs{$doc}}, $ref_occ; } } #group tst timex2s by document foreach $tst_id (keys %$tst_timex2s) { $tst_timex2 = $tst_timex2s->{$tst_id}; $tst_timex2->{FA_VALUE} = 0; foreach $doc (keys %{$tst_timex2->{documents}}) { $tst_occ = $tst_timex2->{documents}{$doc}; push @{$doc_tsts{$doc}}, $tst_occ; } } undef %mapped_timex2_document_value; undef %mapped_timex2_value; foreach $doc (keys %eval_docs) { foreach $ref_occ (@{$doc_refs{$doc}}) { #compute ref timex2 values $ref_id = $ref_occ->{ID}; ($ref_occ->{VALUE}) = timex2_document_value ($ref_occ); $ref_timex2s->{$ref_id}{VALUE} += $ref_occ->{VALUE}; #compute ref-tst mapped timex2 values foreach $tst_occ (@{$doc_tsts{$doc}}) { $tst_id = $tst_occ->{ID}; ($value) = timex2_document_value ($ref_occ, $tst_occ); next unless $value; $mapped_timex2_document_value{$doc}{$ref_id}{$tst_id} = $value; $mapped_timex2_value{$ref_id}{$tst_id} += $value; } } } #compute tst timex2 values foreach $doc (keys %eval_docs) { foreach $tst_occ (@{$doc_tsts{$doc}}) { $tst_id = $tst_occ->{ID}; ($tst_occ->{VALUE}) = timex2_document_value ($tst_occ); $tst_timex2s->{$tst_id}{VALUE} += $tst_occ->{VALUE}; ($tst_occ->{FA_VALUE}) = timex2_document_value (undef, $tst_occ); $tst_timex2s->{$tst_id}{FA_VALUE} += $tst_occ->{FA_VALUE}; } } } ################################# sub timex2_value { my ($timex2) = @_; my $total_value; foreach my $doc (keys %{$timex2->{documents}}) { my $doc_timex2 = $timex2->{documents}{$doc}; ($doc_timex2->{VALUE}) = timex2_document_value($doc_timex2); $total_value += $doc_timex2->{VALUE}; } return $total_value; } ################################# sub mapped_timex2_value { my ($ref_timex2, $tst_timex2) = @_; my ($total_value, $value); foreach my $doc (keys %{$ref_timex2->{documents}}) { next unless defined $tst_timex2->{documents}{$doc}; ($value) = timex2_document_value($ref_timex2->{documents}{$doc}, $tst_timex2->{documents}{$doc}); $total_value += $value if $value; } return $total_value; } ################################# sub timex2_document_value { my ($ref_doc_timex2, $tst_doc_timex2) = @_; my $fa_timex2 = not $ref_doc_timex2; #calculate FA score if ref is null $ref_doc_timex2 = $tst_doc_timex2 unless $ref_doc_timex2; $tst_doc_timex2 = $ref_doc_timex2 unless $tst_doc_timex2; return undef unless $ref_doc_timex2; my $ref_mentions = $ref_doc_timex2->{mentions}; my $tst_mentions = $tst_doc_timex2->{mentions}; my ($mention_value, %mapping_costs); for (my $i=0; $i<@$ref_mentions; $i++) { for (my $j=0; $j<@$tst_mentions; $j++) { $mention_value = timex2_mention_score ($ref_mentions->[$i], $tst_mentions->[$j]); next unless defined $mention_value; $mapping_costs{$j}{$i} -= $mention_value + $timex2_fa_wgt; } } return undef unless %mapping_costs; #find optimum mapping of ref mentions to tst mentions my ($map) = weighted_bipartite_graph_matching(\%mapping_costs) or die "\n\nFATAL ERROR: Document level timex2 mention mapping through BGM FAILED\n\n"; undef $map if $fa_timex2; #calculate FA score if ref is null my ($doc_mentions_value, $mentions_map); for (my $j=0; $j<@$tst_mentions; $j++) { my $i = $map->{$j}; if (defined $i) { $doc_mentions_value -= $mapping_costs{$j}{$i}; $mentions_map->{$i} = $j; } $doc_mentions_value -= $timex2_fa_wgt; } return undef unless defined $doc_mentions_value; #compute value my $timex2_value = $timex2_detection_wgt; foreach my $attribute (keys %timex2_attribute_wgt) { my $ref_att = $ref_doc_timex2->{attributes}{$attribute}; my $tst_att = $tst_doc_timex2->{attributes}{$attribute}; $timex2_value += $timex2_attribute_wgt{$attribute} * ($ref_att eq $tst_att ? 1 : $epsilon) if (defined $ref_att and defined $tst_att); } $timex2_value = max($timex2_value,$epsilon); return $timex2_value*$doc_mentions_value, $mentions_map; } ################################# sub timex2_mention_score { #N.B. The mention mapping score must be undef if tst doesn't match ref. my ($ref_mention, $tst_mention) = @_; my ($ref_locator, $tst_locator, $overlap); $ref_locator = $ref_mention->{head}{locator}; $tst_locator = $tst_mention->{head}{locator}; return (span_overlap($ref_locator, $tst_locator) < $min_overlap ? undef : 1) if $ref_locator and $tst_locator; $ref_locator = $ref_mention->{extent}{locator}; $tst_locator = $tst_mention->{extent}{locator}; return (span_overlap($ref_locator, $tst_locator, \&min) < $min_overlap ? undef : 1) if $ref_locator and $tst_locator; return undef; } ################################# sub map_timex2s { my ($ref_id, $tst_id, $ref_timex2, $tst_timex2); my (@ref_timex2s, @tst_timex2s, @ref_cohorts, @tst_cohorts); #collect timex2s foreach $ref_id (keys %{$ref_database{timex2s}}) { $ref_timex2 = $ref_database{timex2s}{$ref_id}; push @ref_timex2s, $ref_timex2; } foreach $tst_id (keys %{$tst_database{timex2s}}) { $tst_timex2 = $tst_database{timex2s}{$tst_id}; push @tst_timex2s, $tst_timex2; } #group timex2s into cohort sets and map each set independently foreach $ref_timex2 (@ref_timex2s) { next if exists $ref_timex2->{cohort}; tag_cohorts ($ref_timex2, \@ref_timex2s, \@tst_timex2s, \%mapped_timex2_value, 1); @ref_cohorts = collect_cohorts (\@ref_timex2s); @tst_cohorts = collect_cohorts (\@tst_timex2s); map_cohorts (\@ref_cohorts, \@tst_cohorts, \%mapped_timex2_value, $timex2_fa_wgt); foreach $ref_timex2 (@ref_cohorts) { $tst_timex2 = $ref_timex2->{MAP}; next unless $tst_timex2; map_document_level_timex2_mentions ($ref_timex2, $tst_timex2); } } } ################################# sub map_document_level_timex2_mentions { my ($ref_timex2, $tst_timex2) = @_; foreach my $doc (keys %{$ref_timex2->{documents}}) { my $ref_occ = $ref_timex2->{documents}{$doc}; my $tst_occ = $tst_timex2->{documents}{$doc}; next unless $tst_occ; $ref_occ->{MAP} = $tst_occ; $tst_occ->{MAP} = $ref_occ; #map mentions (my $value, my $map) = timex2_document_value($ref_occ, $tst_occ); my $ref_mentions = $ref_occ->{mentions}; my $tst_mentions = $tst_occ->{mentions}; foreach my $i (keys %$map) { my $j = $map->{$i}; $ref_mentions->[$i]{MAP} = $tst_mentions->[$j]; $tst_mentions->[$j]{MAP} = $ref_mentions->[$i]; } } } ################################# sub print_timex2_data { my ($type, $db) = @_; my ($timex2_id); print "\n======== $type timexs ========\n\n"; foreach $timex2_id (sort keys %{$db->{timex2s}}) { print_timex2 ($db->{timex2s}{$timex2_id}); } } ################################# sub print_timex2 { my ($timex2) = @_; my $total_value; printf "timex2 ID=$timex2->{ID}, VALUE=%.5f, TYPE=$timex2->{TYPE}", timex2_value($timex2); foreach my $attribute (sort keys %{$timex2->{attributes}}) { print ", $attribute=$timex2->{attributes}{$attribute}" if defined $timex2->{attributes}{$attribute}; } print "\n"; foreach my $doc (sort keys %{$timex2->{documents}}) { my $doc_info = $timex2->{documents}{$doc}; print " -- in document $doc\n"; foreach my $mention (sort sort_on_locator @{$doc_info->{mentions}}) { printf " mention extent=\"%s\"\n", defined $mention->{extent}{text} ? $mention->{extent}{text} : "???"; } } } ################################# sub print_timex2_mapping { my ($ref_db, $tst_db, $label) = @_; my ($ref_timex2, $tst_timex2, $ref_id, $tst_id, $doc, $ref_occ, $tst_occ, $attribute, $output); print "\n======== $label ========\n\n"; foreach $ref_id (sort keys %{$ref_db->{timex2s}}) { my $print_data = $print_all_data; $output = "--------\n"; $ref_timex2 = $ref_db->{timex2s}{$ref_id}; if ($tst_timex2 = $ref_timex2->{MAP}) { $tst_id = $tst_timex2->{ID}; my $attribute_error; foreach $attribute (sort keys %timex2_attribute_wgt) { next if (not defined $ref_timex2->{attributes}{$attribute} and not defined $tst_timex2->{attributes}{$attribute}); next if (defined $ref_timex2->{attributes}{$attribute} and defined $tst_timex2->{attributes}{$attribute} and $ref_timex2->{attributes}{$attribute} eq $tst_timex2->{attributes}{$attribute}); $attribute_error .= "/" if $attribute_error; $attribute_error .= $attribute; } $print_data ||= $print_err_data if $attribute_error; $output .= ($attribute_error ? ">>> " : " ")."ref timex2 $ref_id"; foreach $attribute (sort keys %timex2_attribute_wgt) { next if not defined $ref_timex2->{attributes}{$attribute}; $output .= ", $attribute=$ref_timex2->{attributes}{$attribute}"; } $output .= "\n"; $output .= ($attribute_error ? ">>> " : " ")."tst timex2 $tst_id"; foreach $attribute (sort keys %timex2_attribute_wgt) { next if not defined $tst_timex2->{attributes}{$attribute}; $output .= ", $attribute=$tst_timex2->{attributes}{$attribute}"; } $output .= $attribute_error ? " -- ATTRIBUTE ERROR ($attribute_error)\n" : "\n"; $output .= sprintf (" timex2 score:%9.5f out of%9.5f\n", $mapped_timex2_value{$ref_id}{$tst_id}, $ref_timex2->{VALUE}); } else { $print_data ||= $print_err_data; $output .= ">>> ref timex2 $ref_id -- NO MATCHING TST TIMEX2\n"; $output .= sprintf (" timex2 score: 0.00000 out of%9.5f\n", $ref_timex2->{VALUE}); } foreach $doc (keys %{$ref_timex2->{documents}}) { next unless defined $eval_docs{$doc}; $ref_occ = $ref_timex2->{documents}{$doc}; print_timex2_mapping_details ($ref_occ, $ref_occ->{MAP}, $doc, $print_data, $output); $output = ""; } } foreach $tst_id (sort keys %{$tst_db->{timex2s}}) { $tst_timex2 = $tst_db->{timex2s}{$tst_id}; next if $tst_timex2->{MAP}; my $print_data = ($print_all_data or $print_err_data); $output = "--------\n"; $output .= ">>> tst timex2 $tst_id"; foreach $attribute (sort keys %timex2_attribute_wgt) { next if not defined $tst_timex2->{attributes}{$attribute}; $output .= ", $attribute=$tst_timex2->{attributes}{$attribute}"; } $output .= " -- NO MATCHING REF TIMEX2\n"; $output .= sprintf (" timex2 score:%9.5f out of 0.00000\n", $tst_timex2->{FA_VALUE}); foreach $doc (keys %{$tst_timex2->{documents}}) { next unless defined $eval_docs{$doc}; $tst_occ = $tst_timex2->{documents}{$doc}; print_timex2_mapping_details (undef, $tst_occ, $doc, $print_all_data, $output); $output = ""; } } } ################################# sub print_timex2_mapping_details { my ($ref_timex2, $tst_timex2, $doc, $print_data, $output) = @_; my ($type, $ref_mention, $tst_mention, $mention, @mentions); $output .= "- in document $doc:\n"; if ($ref_timex2) { foreach $mention (@{$ref_timex2->{mentions}}) { push @mentions, {DATA=>$mention, TYPE=>"REF"}; } } if ($tst_timex2) { foreach $mention (@{$tst_timex2->{mentions}}) { push @mentions, {DATA=>$mention, TYPE=>"TST"}; } } if ($ref_timex2 and $tst_timex2) { foreach $mention (sort sort_on_locator @mentions) { $type = $mention->{TYPE}; $mention = $mention->{DATA}; next if $type eq "TST" and $mention->{MAP}; if ($mention->{MAP}) { $ref_mention = $mention; $tst_mention = $mention->{MAP}; my $extent_error = extent_mismatch ($ref_mention->{extent}{locator}, $tst_mention->{extent}{locator}) > $epsilon; $print_data ||= $print_err_data if $extent_error; if (not $extent_error and defined $ref_mention->{extent}{text} and defined $tst_mention->{extent}{text} and $ref_mention->{extent}{text} eq $tst_mention->{extent}{text}) { $output .= " mention = \"" . $ref_mention->{extent}{text} . "\"\n"; } else { $output .= $extent_error ? ">>> " : " "; $output .= "ref mention = \"" . (defined $ref_mention->{extent}{text} ? $ref_mention->{extent}{text} : "???") . "\""; $output .= "\n"; $output .= $extent_error ? ">>> " : " "; $output .= "tst mention = \"" . (defined $tst_mention->{extent}{text} ? $tst_mention->{extent}{text} : "???") . "\""; $output .= $extent_error ? " -- EXTENT ERROR\n" : "\n"; } } else { $print_data ||= $print_err_data; $output .= ">>> ".(lc$type)." mention = \"" . (defined $mention->{extent}{text} ? $mention->{extent}{text} : "???") . "\""; $output .= " -- NO MATCHING %s MENTION\n", } } } else { $print_data ||= $print_err_data; foreach $mention (sort sort_on_locator @mentions) { $type = $mention->{TYPE}; $mention = $mention->{DATA}; $output .= " ".(lc$type)." mention = \"" . (defined $mention->{extent}{text} ? $mention->{extent}{text} : "???") . "\""; $output .= "\n"; } } print $output if $print_data; } ################################# sub get_entity_data { #extract document-level information for all entities in the document my ($span) = @_; my (@entities, %entity, %entity_ids); my ($tag, $data, $type, $class, $nentities); while (($tag, $data, $span) = extract_sgml_tag_and_span ("entity", $span)) { undef %entity; #get entity ID $input_entity = extract_sgml_tag_attribute ("ID", $tag) or die "\n\nFATAL INPUT ERROR: no entity ID found for an entity in file '$input_file'\n". " entity tag is '$tag'\n entity data is '$data'\n\n"; $input_entity =~ s/^\s*|\s*$//g; #trim any white space from beginning/end of entity ID not defined $entity_ids{$input_entity} or die "\n\nFATAL ERROR: multiple definitions of entity '$input_entity' in document '$input_doc'\n". " (entities may be defined only once per document)\n\n"; $entity_ids{$input_entity} = 1; $entity{ID} = $input_entity; #get entity TYPE $type = extract_sgml_tag_attribute ("TYPE", $tag) or die "\n\nFATAL INPUT ERROR: no entity TYPE found for entity '$input_entity'\n". " entity tag is '$tag'\n entity data is '$data'\n\n"; $entity{TYPE} = normalize_entity_type($type) or die "\n\nFATAL INPUT ERROR: unrecognized entity TYPE ($type) for entity '$input_entity'\n". " entity tag is '$tag'\n entity data is '$data'\n\n"; #get entity SUBTYPE $entity{SUBTYPE} = extract_sgml_tag_attribute ("SUBTYPE", $tag); $entity{SUBTYPE} = "" unless defined $entity{SUBTYPE}; $entity_subtypes{$entity{TYPE}}{$entity{SUBTYPE}} or die "\n\nFATAL INPUT ERROR: illegal entity SUBTYPE ($entity{SUBTYPE}) for entity type '$entity{TYPE}' for entity '$input_entity'\n". " entity tag is '$tag'\n entity data is '$data'\n\n"; #get entity CLASS $class = extract_sgml_tag_attribute ("CLASS", $tag) or die "\n\nFATAL INPUT ERROR: no entity CLASS found for entity '$input_entity'\n". " entity tag is '$tag'\n entity data is '$data'\n\n"; $entity{CLASS} = normalize_entity_class($class) or die "\n\nFATAL INPUT ERROR: unrecognized entity class ($class) for entity '$input_entity'\n". " entity tag is '$tag'\n entity data is '$data'\n\n"; #get other entity data $entity{mentions} = [get_entity_mentions (\%entity, $data)]; $entity{names} = [get_entity_names ($data)]; push @entities, {%entity}; $nentities++; } return @entities; } ################################# sub get_entity_mentions { #extract mention information for all mentions of an entity in a document my ($entity, $mention_data) = @_; my (%mention, @mentions); my ($tag, $data, $head); my ($type, $mention_role, $ref_style); while (($tag, $data, $mention_data) = extract_sgml_tag_and_span ("entity_mention", $mention_data)) { undef %mention; #get entity mention ID $mention{ID} = extract_sgml_tag_attribute ("ID", $tag); #get entity mention TYPE $type = extract_sgml_tag_attribute ("TYPE", $tag) or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in document '$input_doc'\n". " no mention TYPE found in tag '$tag'\n mention data is '$data'\n\n"; $mention{TYPE} = $normalize_mention_type{$type} or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in document '$input_doc'\n". " unrecognized mention type ($type)\n mention data is '$data'\n\n"; #get entity mention ROLE ($mention_role) = extract_sgml_tag_attribute ("ROLE", $tag); $mention_role = $entity->{TYPE} unless defined $mention_role; $mention{ROLE} = normalize_entity_type($mention_role) or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in document '$input_doc'\n". " unrecognized mention role ($mention_role)\n mention data is '$data'\n\n"; #get reference style ($ref_style) = extract_sgml_tag_attribute ("METONYMY_MENTION", $tag); ($ref_style) = extract_sgml_tag_attribute ("REFERENCE", $tag) unless defined $ref_style; ($ref_style) = extract_sgml_tag_attribute ("STYLE", $tag) unless defined $ref_style; $ref_style = "LITERAL" unless defined $ref_style; $ref_style = "METONYMIC" if $ref_style =~ /^(TRUE|INTENDED)$/; $ref_style =~ /^(LITERAL|METONYMIC)$/ or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in document '$input_doc'". " unrecognized mention reference ($ref_style)\n mention data is '$data'\n\n"; $mention{STYLE} = $ref_style; #get entity mention LDCTYPE and LDCATR $mention{LDCTYPE} = extract_sgml_tag_attribute ("LDCTYPE", $tag); $mention{LDCATR} = extract_sgml_tag_attribute ("LDCATR", $tag); #get entity mention extent $mention{extent} = get_locator ("extent", $data) or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in document '$input_doc'\n". " no mention extent found in data ($data)\n\n"; #get entity mention head $head = get_locator ("head", $data); $mention{head} = $head if defined $head; push @mentions, {%mention}; } @mentions > 0 or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in document '$input_doc'\n". " entity contains no mentions\n\n"; return @mentions; } ################################# sub reduce_entity_to_single_role { my ($entity) = @_; my ($doc, $doc_info, $mention); $entity->{documents} = {%{$entity->{documents}}}; foreach $doc (keys %{$entity->{documents}}) { $doc_info = $entity->{documents}{$doc} = {%{$entity->{documents}{$doc}}}; $doc_info->{ID} = $entity->{ID}; $doc_info->{TYPE} = $entity->{TYPE}; my @same_role_mentions; foreach $mention (@{$doc_info->{mentions}}) { push @same_role_mentions, {%$mention} if $mention->{ROLE} eq $entity->{TYPE}; } $doc_info->{mentions} = [@same_role_mentions]; delete $entity->{documents}{$doc} unless @same_role_mentions > 0; } } ################################# sub longest_entity_mention_head { my ($entity) = @_; my (@docs, $doc, @mentions, $mention, $text); my $longest_head=""; if ($entity->{documents}) { # this is a database entity @docs = keys %{$entity->{documents}}; foreach $doc (@docs) { @mentions = @{$entity->{documents}{$doc}{mentions}}; foreach $mention (@mentions) { $text = $mention->{head}{text} if defined $mention->{head}{text}; $longest_head = $text if defined $text and length($text) > length($longest_head); } } } else { # this is a document-level entity @mentions = @{$entity->{mentions}}; foreach $mention (@mentions) { $text = $mention->{head}{text} if defined $mention->{head}{text}; $longest_head = $text if defined $text and length($text) > length($longest_head); } } return length($longest_head) > 0 ? $longest_head : undef; } ################################# sub get_entity_names { #extract name information for all names of an entity in a document my ($data) = @_; my ($tag, $span, $name, @names); ($data) = extract_sgml_span ("entity_attributes", $data); while ($name=get_locator("name",$data)) { push @names, $name; ($tag, $span, $data) = extract_sgml_tag_and_span ("name", $data); } return @names; } ################################# sub longest_entity_name { my ($entity) = @_; my (@docs, $doc, @names, $name, $text); my $longest_name=""; if ($entity->{documents}) { # this is a database entity @docs = keys %{$entity->{documents}}; foreach $doc (@docs) { next unless defined $entity->{documents}{$doc}{names}; @names = @{$entity->{documents}{$doc}{names}}; foreach $name (@names) { $text = $name->{text}; $longest_name = $text if defined $text and length($text) > length($longest_name); } } } else { # this is a document-level entity return undef unless defined $entity->{names}; @names = @{$entity->{names}}; foreach $name (@names) { $text = $name->{text}; $longest_name = $text if defined $text and length($text) > length($longest_name); } } return length($longest_name) > 0 ? $longest_name : undef; } ################################# sub get_locator { my ($name, $data) = @_; my ($span, %info); return undef unless ($span) = extract_sgml_span ($name, $data); if ($data_type eq "text") { ($info{locator}) = get_text_locator ($span); $info{text} = $info{locator}{text}; } elsif ($data_type eq "audio") { ($info{locator}) = get_audio_locator ($span); $info{text} = $info{locator}{text}; } elsif ($data_type eq "image") { ($info{locator}) = [get_image_locator ($span)]; $info{text} = ""; foreach my $bbox (@{$info{locator}}) { $info{text} .= $bbox->{locator}{text}; } } else { die "\n\nFATAL INPUT ERROR for document '$input_doc' in file '$input_file'\n". " No locator read routine for '$name' locator in '$data' for '$data_type'\n\n"; } $info{text} =~ s/-\n//sg; $info{text} =~ s/\n/ /sg; $info{text} =~ s/\s+/ /sg; $info{locator}{data_type} = $data_type; return {%info}; } ################################# sub get_text_locator { my ($data) = @_; my (%locator, $tag, $span); ($tag, $span) = extract_sgml_tag_and_span ("charseq", $data) or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in document '$input_doc'". "in file '$input_file'\n text mention contains no position info (no 'charseq' tag): '$data'\n\n"; ($locator{start}) = extract_sgml_tag_attribute ("START", $tag) or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in file '$input_file'\n". " No 'START' attribute found in data '$data'\n\n"; ($locator{end}) = extract_sgml_tag_attribute ("END", $tag) or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in file '$input_file'\n". " No 'END' attribute found in data '$data'\n\n"; $locator{end}-$locator{start} >= 0 or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in file '$input_file'\n". " Negative text span in data '$data'\n\n"; $locator{start} >= 0 or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in file '$input_file'\n". " Negative character index in data '$data'\n\n"; ($locator{text}) = $span; return {%locator}; } ################################# sub get_audio_locator { my ($data) = @_; my (%locator, $tag, $span); ($tag, $span) = extract_sgml_tag_and_span ("timespan", $data) or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in document '$input_doc'". "in file '$input_file'\n audio mention contains no timing info (no 'times' tag): '$data'\n\n"; ($locator{tstart}) = extract_sgml_tag_attribute ("START", $tag) or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in file '$input_file'\n". " No 'START' tag found in data '$data'\n\n"; ($locator{tdur}) = extract_sgml_tag_attribute ("END", $tag) - $locator{tstart} or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in file '$input_file'\n". " No 'END' tag found in data '$data'\n\n"; $locator{tdur} >= 0 or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in file '$input_file'\n". " Negative time duration in data '$data'\n\n"; $locator{tstart} >= 0 or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in file '$input_file'\n". " Negative start time in data '$data'\n\n"; ($locator{text}) = $span; return {%locator}; } ################################# sub get_image_locator { my ($data) = @_; my (%box, @boxlist); my ($tag, $span); ($data) = extract_sgml_span ("bblist", $data); my $nboxes = 0; while (($tag, $span, $data) = extract_sgml_tag_and_span ("pixelboundingbox", $data)) { $box{page} = extract_sgml_tag_attribute ("Signal", $tag) or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in file '$input_file'\n". " No 'Signal' attribute found in tag '$tag'\n\n"; $box{x_start} = extract_sgml_tag_attribute ("x1", $tag) or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in file '$input_file'\n". " No 'x1' tag found in data '$tag'\n\n"; $box{y_start} = extract_sgml_tag_attribute ("y1", $tag) or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in file '$input_file'\n". " No 'y1' tag found in data '$tag'\n\n"; $box{width} = extract_sgml_tag_attribute ("x2", $tag) or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in file '$input_file'\n". " No 'x2' tag found in data '$tag'\n\n"; $box{height} = extract_sgml_tag_attribute ("y2", $tag) or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in file '$input_file'\n". " No 'y2' tag found in data '$tag'\n\n"; $box{width} >= 0 or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in file '$input_file'\n". " Negative bounding box width in data '$tag'\n\n"; $box{height} >= 0 or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in file '$input_file'\n". " Negative bounding box height in data '$tag'\n\n"; $box{x_start} >= 0 or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in file '$input_file'\n". " Negative 'x1' in data '$tag'\n\n"; $box{y_start} >= 0 or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in file '$input_file'\n". " Negative 'y1' in data '$tag'\n\n"; $box{text} = $span; push @boxlist, {%box}; $nboxes++; } $nboxes > 0 or die "\n\nFATAL INPUT ERROR for entity '$input_entity' in document '$input_doc'". "in file '$input_file'\n image mention contains no boxes (no 'pixelboundingbox' tag): '$data'\n\n"; return @boxlist; } ################################# sub sort_on_locator { my $ax = $a; my $bx = $b; ($ax, $bx) = ($ax->{DATA}, $bx->{DATA}) if defined $ax->{DATA} and defined $bx->{DATA}; $ax = defined $ax->{head}{locator} ? $ax->{head}{locator} : defined $ax->{extent}{locator} ? $ax->{extent}{locator} : defined $ax->{locator} ? $ax->{locator} : die "\n\nFATAL ERROR in input to sort_on_locator\n\n"; $bx = defined $bx->{head}{locator} ? $bx->{head}{locator} : defined $bx->{extent}{locator} ? $bx->{extent}{locator} : defined $bx->{locator} ? $bx->{locator} : die "\n\nFATAL ERROR in input to sort_on_locator\n\n"; if ($ax->{data_type} eq "text" and $bx->{data_type} eq "text") { return $ax->{start} <=> $bx->{start}; } elsif ($ax->{data_type} eq "audio" and $bx->{data_type} eq "audio") { return $ax->{tstart} <=> $bx->{tstart}; } elsif ($ax->{data_type} eq "image" and $bx->{data_type} eq "image") { my $ax_box = $ax->{bblist}[0]; my $bx_box = $bx->{bblist}[0]; my $cmp = $ax_box->{page} <=> $bx_box->{page}; return $cmp if $cmp; $cmp = $ax_box->{y_start} <=> $bx_box->{y_start}; return $cmp if $cmp; $cmp = $ax_box->{x_start} <=> $bx_box->{x_start}; return $cmp; } else { die "\n\nFATAL ERROR in sort_on_locator\n\n"; } } ################################# sub get_relation_data { #extract document-level information for all relations in the document my ($data) = @_; my (@relations, %relation_ids); my ($tag, $span, $type, $subtype, $class, $attribute, $mention); while (($tag, $span, $data) = extract_sgml_tag_and_span ("relation", $data)) { my %relation; #get relation ID ($input_relation) = extract_sgml_tag_attribute ("ID", $tag) or die "\n\nFATAL INPUT ERROR: no relation ID found for a relation in file '$input_file'\n". " Relation tag is '$tag'\n Relation data is '$span'\n\n"; $input_relation =~ s/^\s*|\s*$//g; #trim any white space from beginning/end of relation ID not defined $relation_ids{$input_relation} or die "\n\nFATAL ERROR: multiple definitions of relation '$input_relation'\n". " (every relation ID must be unique)\n\n"; $relation_ids{$input_relation} = 1; $relation{ID} = $input_relation; #get relation TYPE ($type) = extract_sgml_tag_attribute ("TYPE", $tag) or die "\n\nFATAL INPUT ERROR for relation '$input_relation' in document '$input_doc'". " in file '$input_file'\n no relation TYPE found. Relation tag is '$tag'\n\n"; $relation{TYPE} = $normalize_relation_type{$type} or die "\n\nFATAL INPUT ERROR for relation '$input_relation' in document '$input_doc'". " in file '$input_file'\n unrecognized relation type ($type). Relation tag is '$tag'\n\n"; #get relation SUBTYPE ($subtype) = extract_sgml_tag_attribute ("SUBTYPE", $tag) or $subtype = ""; $relation{SUBTYPE} = $subtype; $relation_subtypes{$relation{TYPE}}{$relation{SUBTYPE}} or die "\n\nFATAL INPUT ERROR: illegal relation SUBTYPE ($relation{SUBTYPE}) for relation type $relation{TYPE}\n". " Relation tag is '$tag'\n Relation data is '$data'\n\n"; #get relation CLASS ($class) = extract_sgml_tag_attribute ("CLASS", $tag); $class = "EXPLICIT" unless defined $class; $relation{CLASS} = $normalize_relation_class{$class} or die "\n\nFATAL INPUT ERROR for relation '$input_relation' in document '$input_doc'". " in file '$input_file'\n unrecognized relation class ($class). Relation tag is '$tag'\n\n"; $relation{arguments} = [get_relation_arguments ($span)]; $relation{mentions} = [get_relation_mentions ($span)]; $relation{times} = [get_times ("relation_time", $span)]; promote_relation_mention_times (\%relation); push @relations, {%relation}; } return @relations; } ################################# sub promote_relation_mention_times { my ($relation) = @_; my ($mention, $time); foreach $mention (@{$relation->{mentions}}) { foreach $time (@{$mention->{times}}) { next if null_relation_time ($time); my %time = %{$time}; delete $time{text}; push @{$relation->{times}}, \%time unless num_relation_time_matches ([$time], $relation->{times}); } } } ################################# sub null_relation_time { my ($time) = @_; foreach my $attribute (@relation_time_attributes) { return "" if $time->{$attribute}; } return 1; } ################################# sub get_relation_arguments { my ($data) = @_; my ($tag, $span, $argument, @arguments); while ($argument = get_relation_argument ($data)) { push @arguments, $argument; ($tag, $span, $data) = extract_sgml_tag_and_span ("rel_entity_arg", $data); } @arguments = sort {$a->{ARGNUM} <=> $b->{ARGNUM}} @arguments; return @arguments; } ################################# sub get_relation_argument { my ($data) = @_; my ($tag, $span, $num, %argument); return undef unless ($tag, $span) = extract_sgml_tag_and_span ("rel_entity_arg", $data); $argument{ARGNUM} = extract_sgml_tag_attribute ("ARGNUM", $tag) or die "\n\nFATAL INPUT ERROR for relation '$input_relation' in document '$input_doc' in file '$input_file'\n". " No ARGNUM attribute. Relation data is '$data'\n\n"; $argument{ARGNUM} == 1 or $argument{ARGNUM} == 2 or die "\n\nFATAL INPUT ERROR for relation '$input_relation' in document '$input_doc' in file '$input_file'\n". " ARGNUM must be '1' or '2', but ARGNUM is '$argument{ARGNUM}'. Relation data is '$data'\n\n"; $argument{ID} = extract_sgml_tag_attribute ("ENTITYID", $tag) or die "\n\nFATAL INPUT ERROR for relation '$input_relation' in document '$input_doc' in file '$input_file'\n". " No argument ENTITYID attribute. Relation data is '$data'\n\n"; return {%argument}; } ################################# sub get_times { my ($name, $data) = @_; my ($tag, $span, $time, @times); while ($time = get_time ($name, $data)) { push @times, $time; ($tag, $span, $data) = extract_sgml_tag_and_span ($name, $data); } return @times; } ################################# sub get_time { my ($name, $data) = @_; my ($tag, $span, $attribute, %time); return undef unless ($tag, $span) = extract_sgml_tag_and_span ($name, $data); foreach $attribute (@relation_time_attributes) { $time{$attribute} = extract_sgml_tag_attribute ($attribute, $tag); } ($time{source}) = get_locator ("source", $span); return {%time}; } ################################# sub get_relation_mentions { my ($data) = @_; my ($tag, $span, @mentions, $mention); while ($mention = get_relation_mention ($data)) { push @mentions, $mention; ($tag, $span, $data) = extract_sgml_tag_and_span ("relation_mention", $data); } return @mentions; } ################################# sub get_relation_mention { my ($data) = @_; my ($tag, $span, %mention); return undef unless ($tag, $span) = extract_sgml_tag_and_span ("relation_mention", $data); $mention{ID} = extract_sgml_tag_attribute ("ID", $tag) or die "\n\nFATAL INPUT ERROR for relation '$input_relation' in document '$input_doc' in file '$input_file'\n". " ID attribute is missing. Relation data is '$data'\n\n"; $mention{LC} = extract_sgml_tag_attribute ("LDCLEXICALCONDITION", $tag); $mention{LC} = "unknown" unless $mention{LC}; $mention{extent} = get_locator ("ldc_extent", $data); $mention{arguments} = [get_relation_mention_arguments ($span)]; $mention{times} = [get_times ("rel_mention_time", $span)]; return {%mention}; } ################################# sub get_relation_mention_arguments { my ($data) = @_; my ($tag, $span, $argument, @arguments); while ($argument = get_relation_mention_argument ($data)) { push @arguments, $argument; ($tag, $span, $data) = extract_sgml_tag_and_span ("rel_mention_arg", $data); } @arguments = sort {$a->{ARGNUM} <=> $b->{ARGNUM}} @arguments; return @arguments; } ################################# sub get_relation_mention_argument { my ($data) = @_; my ($tag, $span, $num, %argument); return undef unless ($tag, $span) = extract_sgml_tag_and_span ("rel_mention_arg", $data); $argument{ARGNUM} = extract_sgml_tag_attribute ("ARGNUM", $tag) or die "\n\nFATAL INPUT ERROR for relation '$input_relation' in document '$input_doc' in file '$input_file'\n". " No ARGNUM attribute. Relation data is '$data'\n\n"; $argument{MENTIONID} = extract_sgml_tag_attribute ("ENTITYMENTIONID", $tag) or die "\n\nFATAL INPUT ERROR for relation '$input_relation' in document '$input_doc' in file '$input_file'\n". " No argument ENTITYMENTIONID attribute. Relation data is '$data'\n\n"; $argument{extent} = get_locator ("extent", $span) or die "\n\nFATAL INPUT ERROR for relation '$input_relation' in document '$input_doc' in file '$input_file'\n". " No argument extent. Relation data is '$data'\n\n"; return {%argument}; } ################################# sub extract_sgml_span { my ($name, $data) = @_; return () unless defined $name and defined $data; return () unless ($data =~ /<$name(\s+[^>]*)*>(.*)/si); my $tag = $1; my $remainder = $2; return (undef, $remainder) if ($tag and $tag =~ /\/$/s); ($data =~ /<$name(\s+[^>]*)*>(.*?)<\/$name\s*>(.*)/si) ? ($2, $3) : (); } ################################# sub extract_sgml_tag_and_span { my ($name, $data) = @_; return () unless defined $name and defined $data; return () unless ($data =~ /<$name(\s+[^>]*)*>(.*)/si); my $tag = $1; my $remainder = $2; return ($tag, undef, $remainder) if ($tag and $tag =~ /\/$/s); ($data =~ /<$name(\s+[^>]*)*>(.*?)<\/$name\s*>(.*)/si) ? ($1, $2, $3) : (); } ################################# sub extract_sgml_tag_attribute { my ($name, $data) = @_; return () unless defined $name and defined $data; my $attr = ($data =~ /(^|\s)$name\s*=\s*\"\s*([^\"]*)\s*\"/si) ? ($2) : (); return () if not defined $attr or $attr =~ /^\s*$/; $attr =~ s/^\s*|\s*$//g; return $attr; } ################################# sub extract_sgml_tag_attributes { my ($data) = @_; my %attributes; return () unless defined defined $data; while ($data =~ s/(^|\s+)([^\s]+)\s*=\s*\"\s*([^\"]*)\s*\"//si) { my $type = uc $2; my $attr = $3; $attr =~ s/^\s*|\s*$//g; next if not defined $attr or $attr =~ /^\s*$/; $attributes{$type} = $attr if length $attr; } return {%attributes}; } ################################# sub date_time_stamp { my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(); my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); my ($date, $time); $time = sprintf "%2.2d:%2.2d:%2.2d", $hour, $min, $sec; $date = sprintf "%4.4s %3.3s %s", 1900+$year, $months[$mon], $mday; return ($date, $time); } ################################# sub print_document_data { my ($type, $documents) = @_; my ($doc_id, $doc); print "\n======== $type documents ========\n\n"; foreach $doc_id (sort keys %$documents) { $doc = $documents->{$doc_id}; print "doc ID=$doc_id, "; print "TYPE=$doc->{TYPE}, "; print "FILE=$doc->{FILE}\n"; } } ################################# sub print_entity_data { my ($type, $db) = @_; my ($entity_id); print "\n======== $type entities ========\n\n"; foreach $entity_id (sort keys %{$db->{entities}}) { print_entity ($db->{entities}{$entity_id}); } } ################################# sub print_entity { my ($entity) = @_; my ($name, $title, $doc, $doc_info, $attribute, $mention); my $total_value; foreach $doc (keys %{$entity->{documents}}) { my $doc_entity = $entity->{documents}{$doc}; ($doc_entity->{VALUE}) = entity_document_value($doc_entity); $total_value += $doc_entity->{VALUE}; } printf "entity ID=$entity->{ID}, VALUE=%.5f", $total_value; foreach $attribute (@entity_attributes) { next if $attribute eq "ID"; print ", $attribute=$entity->{$attribute}" if defined $entity->{$attribute}; } print "\n"; foreach $name (@{$entity->{names}}) { print " name=\"$name\"\n"; } foreach $title (@{$entity->{titles}}) { print " title=\"$title\"\n"; } foreach $doc (sort keys %{$entity->{documents}}) { $doc_info = $entity->{documents}{$doc}; print " -- in document $doc\n"; foreach $mention (sort sort_on_locator @{$doc_info->{mentions}}) { print " mention TYPE=$mention->{TYPE}, ROLE=$mention->{ROLE}, STYLE=$mention->{STYLE}, "; printf "head=\"%s\", ", defined $mention->{head}{text} ? $mention->{head}{text} : "???"; printf "extent=\"%s\"\n", defined $mention->{extent}{text} ? $mention->{extent}{text} : "???"; } foreach $name (sort sort_on_locator @{$doc_info->{names}}) { printf " name extent=\"%s\"\n", defined $name->{text} ? $name->{text} : "???"; } } } ################################# sub print_relation_data { my ($type, $db) = @_; my ($relation_id, $relation, $arg, $time); my ($doc_id, $doc, $doc_info); my ($attribute, $mention, $narg, $entity); print "\n======== $type relations ========\n\n"; foreach $relation_id (sort keys %{$db->{relations}}) { $relation = $db->{relations}{$relation_id}; print "relation ID=$relation->{ID}"; foreach $attribute (@relation_attributes) { next if $attribute eq "ID"; print ", $attribute=$relation->{$attribute}" if defined $relation->{$attribute}; } print "\n"; foreach $time (@{$relation->{times}}) { printf " time info:%s\n", relation_time_info($time); } $narg=0; foreach $arg (@{$relation->{arguments}}) { $entity = $db->{entities}{$arg->{ID}}; print " ".relation_arg_description (++$narg, $entity); } print_relation_mentions ($relation->{documents}); } } ################################# sub get_database { my ($file) = @_; my ($line, $data, $tag, $span); my (%entity, $ndb_entities, $attribute); #read data from file open (FILE, $file) or die "\nUnable to open ACE database file '$file'"; while ($line=){ $data .= $line; } close (FILE); #get entity data for all entities in the database file while (($tag, $span, $data) = extract_sgml_tag_and_span ("entity", $data)) { foreach $attribute (@entity_attributes) { next if $attribute eq "ORIGIN"; $entity{$attribute} = extract_sgml_tag_attribute ($attribute, $tag); } $entity{CLASS} = "SPECIFIC" unless defined $entity{CLASS}; $entity{names} = [get_entity_attribute_vector ($span, "name")]; $entity{titles} = [get_entity_attribute_vector ($span, "title")]; $entity{ID} or die "\n\nFATAL INPUT ERROR: no entity ID found in tag '$tag' in database file '$file'\n\n"; $entity{TYPE} or die "\n\nFATAL INPUT ERROR: no entity TYPE found in tag '$tag' in database file '$file'\n\n"; not $ref_database{entities}{$entity{ID}} or die "\n\nFATAL INPUT ERROR: entity ID '$entity{ID}' in database file '$file' is not unique\n\n"; ($entity{TYPE}) = normalize_entity_type($entity{TYPE}); ($ref_database{entities}{$entity{ID}}) = {%entity}; $ref_database{entities}{$entity{ID}}{ORIGIN} = "DATABASE"; #flag this entity as a database entity $ref_database{entities}{$entity{ID}}{VALUE} = entity_value($ref_database{entities}{$entity{ID}}, $ref_database{entities}{$entity{ID}}); $ndb_entities++; } $ndb_entities or warn "\n\nWARNING: database file '$file' contains no entities\n\n"; } ################################# sub normalize_entity_type { my ($type) = @_; $type =~ s/^\s*|\s*$//g; return $normalize_entity_type{substr($type,0,3)}; } ################################# sub normalize_entity_class { my ($class) = @_; $class =~ s/^\s*|\s*$//g; return $normalize_entity_class{substr($class,0,3)}; } ################################# sub get_entity_attribute_vector { my ($entity_data, $attribute) = @_; my $span; my (%attribute, @attributes); my ($tag, $data); ($span) = extract_sgml_span ("entity_attributes", $entity_data); while ($span and (($tag, $data, $span) = extract_sgml_tag_and_span ($attribute, $span))) { push @attributes, $data; } return @attributes; } ################################# sub weighted_bipartite_graph_matching { my ($score) = @_; my $INF = 1E30; my (@row_mate, @col_mate, @row_dec, @col_inc); my (@parent_row, @unchosen_row, @slack_row, @slack); my ($k, $l, $row, $col, @col_min, $cost, %cost); my $t = 0; unless (defined $score) { warn "input to BGM is undefined\n"; return undef; } return {} if (keys %$score) == 0; my @rows = sort keys %{$score}; my $miss = "miss"; $miss .= "0" while exists $score->{$miss}; my (@cols, %cols); my $min_score = $INF; foreach $row (@rows) { foreach $col (keys %{$score->{$row}}) { $min_score = min($min_score,$score->{$row}{$col}); $cols{$col} = $col; } } @cols = sort keys %cols; my $fa = "fa"; $fa .= "0" while exists $cols{$fa}; my $reverse_search = @rows < @cols; # search is faster when ncols <= nrows foreach $row (@rows) { foreach $col (keys %{$score->{$row}}) { ($reverse_search ? $cost{$col}{$row} : $cost{$row}{$col}) = $score->{$row}{$col} - $min_score; } } push @rows, $miss; push @cols, $fa; if ($reverse_search) { my @xr = @rows; @rows = @cols; @cols = @xr; } my $nrows = @rows; my $ncols = @cols; my $nmax = max($nrows,$ncols); my $no_match_cost = -$min_score*(1+$required_precision); # subtract the column minimas for ($l=0; $l<$nmax; $l++) { $col_min[$l] = $no_match_cost; next unless $l < $ncols; $col = $cols[$l]; foreach $row (keys %cost) { next unless defined $cost{$row}{$col}; my $val = $cost{$row}{$col}; $col_min[$l] = $val if $val < $col_min[$l]; } } # initial stage for ($l=0; $l<$nmax; $l++) { $col_inc[$l] = 0; $slack[$l] = $INF; } ROW: for ($k=0; $k<$nmax; $k++) { $row = $k < $nrows ? $rows[$k] : undef; my $row_min = $no_match_cost; for (my $l=0; $l<$ncols; $l++) { my $col = $cols[$l]; my $val = ((defined $row and defined $cost{$row}{$col}) ? $cost{$row}{$col}: $no_match_cost) - $col_min[$l]; $row_min = $val if $val < $row_min; } $row_dec[$k] = $row_min; for ($l=0; $l<$nmax; $l++) { $col = $l < $ncols ? $cols[$l]: undef; $cost = ((defined $row and defined $col and defined $cost{$row}{$col}) ? $cost{$row}{$col} : $no_match_cost) - $col_min[$l]; if ($cost==$row_min and not defined $row_mate[$l]) { $col_mate[$k] = $l; $row_mate[$l] = $k; # matching row $k with column $l next ROW; } } $col_mate[$k] = -1; $unchosen_row[$t++] = $k; } goto CHECK_RESULT if $t == 0; my $s; my $unmatched = $t; # start stages to get the rest of the matching while (1) { my $q = 0; while (1) { while ($q < $t) { # explore node q of forest; if matching can be increased, update matching $k = $unchosen_row[$q]; $row = $k < $nrows ? $rows[$k] : undef; $s = $row_dec[$k]; for ($l=0; $l<$nmax; $l++) { if ($slack[$l]>0) { $col = $l < $ncols ? $cols[$l]: undef; $cost = ((defined $row and defined $col and defined $cost{$row}{$col}) ? $cost{$row}{$col} : $no_match_cost) - $col_min[$l]; my $del = $cost - $s + $col_inc[$l]; if ($del < $slack[$l]) { if ($del == 0) { goto UPDATE_MATCHING unless defined $row_mate[$l]; $slack[$l] = 0; $parent_row[$l] = $k; $unchosen_row[$t++] = $row_mate[$l]; } else { $slack[$l] = $del; $slack_row[$l] = $k; } } } } $q++; } # introduce a new zero into the matrix by modifying row_dec and col_inc # if the matching can be increased update matching $s = $INF; for ($l=0; $l<$nmax; $l++) { if ($slack[$l] and ($slack[$l]<$s)) { $s = $slack[$l]; } } for ($q = 0; $q<$t; $q++) { $row_dec[$unchosen_row[$q]] += $s; } for ($l=0; $l<$nmax; $l++) { if ($slack[$l]) { $slack[$l] -= $s; if ($slack[$l]==0) { # look at a new zero and update matching with col_inc uptodate if there's a breakthrough $k = $slack_row[$l]; unless (defined $row_mate[$l]) { for (my $j=$l+1; $j<$nmax; $j++) { if ($slack[$j]==0) { $col_inc[$j] += $s; } } goto UPDATE_MATCHING; } else { $parent_row[$l] = $k; $unchosen_row[$t++] = $row_mate[$l]; } } } else { $col_inc[$l] += $s; } } } UPDATE_MATCHING: # update the matching by pairing row k with column l while (1) { my $j = $col_mate[$k]; $col_mate[$k] = $l; $row_mate[$l] = $k; # matching row $k with column $l last UPDATE_MATCHING if $j < 0; $k = $parent_row[$j]; $l = $j; } $unmatched--; goto CHECK_RESULT if $unmatched == 0; $t = 0; # get ready for another stage for ($l=0; $l<$nmax; $l++) { $parent_row[$l] = -1; $slack[$l] = $INF; } for ($k=0; $k<$nmax; $k++) { $unchosen_row[$t++] = $k if $col_mate[$k] < 0; } } # next stage CHECK_RESULT: # rigorously check results before handing them back for ($k=0; $k<$nmax; $k++) { $row = $k < $nrows ? $rows[$k] : undef; for ($l=0; $l<$nmax; $l++) { $col = $l < $ncols ? $cols[$l]: undef; $cost = ((defined $row and defined $col and defined $cost{$row}{$col}) ? $cost{$row}{$col} : $no_match_cost) - $col_min[$l]; if ($cost < ($row_dec[$k] - $col_inc[$l])) { next unless $cost < ($row_dec[$k] - $col_inc[$l]) - $required_precision*max(abs($row_dec[$k]),abs($col_inc[$l])); warn "BGM: this cannot happen: cost{$row}{$col} ($cost) cannot be less than row_dec{$row} ($row_dec[$k]) - col_inc{$col} ($col_inc[$l])\n"; return undef; } } } for ($k=0; $k<$nmax; $k++) { $row = $k < $nrows ? $rows[$k] : undef; $l = $col_mate[$k]; $col = $l < $ncols ? $cols[$l]: undef; $cost = ((defined $row and defined $col and defined $cost{$row}{$col}) ? $cost{$row}{$col} : $no_match_cost) - $col_min[$l]; if (($l<0) or ($cost != ($row_dec[$k] - $col_inc[$l]))) { next unless $l<0 or abs($cost - ($row_dec[$k] - $col_inc[$l])) > $required_precision*max(abs($row_dec[$k]),abs($col_inc[$l])); warn "BGM: every row should have a column mate: row $row doesn't, col: $col\n"; return undef; } } my %map; for ($l=0; $l<@row_mate; $l++) { $k = $row_mate[$l]; $row = $k < $nrows ? $rows[$k] : undef; $col = $l < $ncols ? $cols[$l]: undef; next unless defined $row and defined $col and defined $cost{$row}{$col}; $reverse_search ? ($map{$col} = $row) : ($map{$row} = $col); } return {%map}; }