#!/usr/bin/perl -w
# Start-up: reject stray arguments, derive the volume and issue numbers from
# the current working directory, then download the list of volumes.
use LWP::Simple;

# This script takes no arguments.  Check that first, so a usage error is
# reported before any network access is attempted.
if ( @ARGV ) {
die "Usage Proc2.pl with no arguments, the vol. number and issue number are taken from the path!!";
}

my $url_volume_list = "http://www.sciencedirect.com/science?_ob=JournalURL&_cdi=6854&_auth=y&_acct=C000058412&_version=1&_urlVersion=0&_userid=2640392&md5=83ef29225305a65e570f317813d45609";

# File-scoped state used by the statements further down the script.
my $pos1=0;
my $pos2=0;
my $tablecount=0;
my $iss_title="";
my $iss_year="";
my $iss_month="";

# The working directory must end in "<digits>v<volume>/i<issue>".
my $curpath=`pwd`;
die "Couldn't determine the current directory " unless defined $curpath;
$curpath =~ s/^\s+//;
$curpath =~ s/\s+$//;

# Keep only the last two path components.
$curpath =~ m{(/[^/]+/[^/]+)$}
    or die "Current directory '$curpath' has fewer than two path components ";
$curpath = $1;

# Every capture is checked: a failed match would otherwise leave $1/$2
# holding values from an earlier match and silently select the wrong issue.
$curpath =~ m{/[0-9]+?v([0-9]+)/i([0-9]+)}
    or die "Can't read the vol. number and issue number from the path '$curpath' ";
my $vol_num=$1;
my $iss_num=$2;

# Download the list of volumes; the link for this issue is looked up in it.
my $content = get($url_volume_list);
die "Couldn't get connect to list of volumes on jct website " unless defined $content;
# Retrieve the link of interest from JCT
# Get the URL for this issue out of the volume list page.
#  * (?![0-9]) stops issue 1 from matching the link for issue 10, 11, ...
#  * [^"]*? keeps the match inside a single HREF attribute value.
#  * The match is checked so a missing link is reported here instead of
#    building a URL from a stale $1.
$content =~ m{HREF="([^"]*?Volume_\Q$vol_num\E,_Issue_\Q$iss_num\E(?![0-9])[^"]*?md5=[0-9,a-z]+)"}s
    or die "Couldn't find the link for volume $vol_num, issue $iss_num in the list of volumes ";
my $url2 = "http://www.sciencedirect.com" . $1;
my $content2 = get($url2);
die "Couldn't get data for that volume " unless defined $content2;

# Clean the file a little first
# Remove all scripts (assuming no scripts within scripts)
$content2 =~ s/<(script)[^>]*?>.*?<\/\1[^>]*?>//sig ;

# Get the page numbers, month and year for this issue, all available in the title
# NOTE(review): the pattern below looks damaged in this copy of the script;
# it presumably named an opening and a closing HTML tag (the title element?).
# It is kept exactly as found -- confirm against the original source.
if ( $content2 =~ m{
]*?>(.*?)]*?>}si ) {
    $iss_title = $1;
}

# Keep the title from the word "Volume" onwards.
if ( $iss_title =~ m/(Volume.*)$/si ) {
    $iss_title = $1;
}

# The parenthesised part of the title is read as "<month> <year>".
if ( $iss_title =~ m/\((.*?)\)/si ) {
    my $issue_date = $1;
    if ( $issue_date =~ m/(\w+)\s+?(\S+)/si ) {
        $iss_month = $1;
        $iss_year  = $2;
    }
}
#The articles are in a series of tables identifiable by class="tableResults-B"
#Extract them all
pos $content2 = 0;

# Per-article data.  The extraction loop stores the first article at index 1.
my $num_articles = 0;
my @ArticleTables = ();
my @Art_DOI = ();
my @Art_type =(); #Article or Review or Editor's Note ...
my @Abstract_ref = ();
my @Authors =();
my @Title = ();
my @Pages = ();
my @Art_xmlfile = ();

# Data read from the ThermoML files found locally (indexed from 0).
my @ThermoML_fname = ();
my @ThermoML_trcref = ();
my @ThermoML_pages = ();
my @ThermoML_vol = ();
my $ThermoML_count = 0;

my @table_array;

# All XML files in the current directory.  The builtin glob is used instead
# of `ls *.xml` plus chop: no shell is started, names come back without a
# trailing newline, and a directory called *.xml is not expanded to its
# contents.
my @filelist = glob('*.xml');

my @citation = ();
my $filenum = 0;
# Read the <Citation> element of every local XML file.  For each file that
# carries a TRC reference ID, record the file name, the concatenated
# reference ID, the page range and the volume.
#
# NOTE(review): in the copy of the script this was written from, the readline
# operators and the opening-tag half of each pattern were missing.  They are
# restored here from the filehandle opened just above and from the closing
# tags that were still present -- confirm against the original source.
for my $thisfile (@filelist)
{
    open my $xml_in, '<', $thisfile or die " I couldn't open $thisfile\n";

    # Collect the lines from the one containing <Citation> up to and
    # including the one containing </Citation>.  Reading stops at end of
    # file, so a file without a citation cannot hang the script.
    my $curcitation = "";
    my $in_citation = 0;
    while ( defined( my $line = <$xml_in> ) ) {
        $in_citation = 1 if $line =~ m/<Citation>/;
        next unless $in_citation;
        $curcitation .= $line;
        last if $line =~ m/<\/Citation>/;
    }
    close $xml_in;

    next unless $curcitation =~ m/<TRCRefID>(.*?)<\/TRCRefID>/s;
    my $refid_xml = $1;

    # Concatenate the parts of the reference ID in a fixed order.  A part
    # that is absent is skipped instead of re-using the previous capture.
    my $trcref = "";
    for my $tag (qw(yrYrPub sAuthor1 sAuthor2 nAuthorn)) {
        $trcref .= $1 if $refid_xml =~ m/<$tag>(.*?)<\/$tag>/s;
    }
    next unless $trcref;

    my $slot = $ThermoML_count++;
    $ThermoML_fname[$slot]  = $thisfile;
    $ThermoML_trcref[$slot] = $trcref;
    # while ($curcitation =~ m/<sAuthor>([^<]*?)<\/sAuthor>/gs) {print "$1 \n";}
    $ThermoML_pages[$slot] = ( $curcitation =~ m/<sPage>([^<]*?)<\/sPage>/s ) ? $1 : "";
    $ThermoML_vol[$slot]   = ( $curcitation =~ m/<sVol>([^<]*?)<\/sVol>/s )   ? $1 : "";
}
# Cut every results table (including any tables nested inside it) out of the
# issue page, split it into title, pages, authors and abstract link, and
# follow the abstract link to find the article's DOI.
#
# NOTE(review): the "<table" at the start of the pattern was missing in the
# copy this was written from; it is restored on the evidence of the comment
# above the declarations and of the nested-table scan below -- confirm.
while ( $content2 =~ m/<table[^>]*?tableResults-B[^>]*?>/gsi )
{
    $pos1 = $-[0];              # offset where the opening tag starts
    $pos2 = pos $content2;
    $tablecount = 1;
    while ( $tablecount != 0 )
    {
        # On unbalanced markup the search fails; a failed //g match resets
        # pos() and leaves $1 stale, which would restart the scan forever.
        # Take the rest of the page as the table and stop instead.
        unless ( $content2 =~ m{<([/]?table)[^>]*?>}gsi ) {
            $pos2 = length $content2;
            last;
        }
        $pos2 = pos $content2;
        if ( lc($1) eq 'table' ) { $tablecount++; }
        else                     { $tablecount--; }
    }
    $ArticleTables[++$num_articles] = substr($content2,$pos1,$pos2-$pos1);
    pos $content2 = $pos2;

    # Find content in this article
    # first get table data item table 1, row 1 column 2
    @table_array = table_to_array($ArticleTables[$num_articles]);

    # Successive //g matches walk through the cell one segment at a time.
    # NOTE(review): the segment delimiter in these four patterns is a literal
    # line break in this copy; it may originally have been an HTML tag (a
    # line-break tag?).  Kept exactly as found -- confirm against the original.
    $table_array[1][2] =~ m/(.*?)
/gsi;
    $Title[$num_articles] = $1;
    $table_array[1][2] =~ m/(.*?)
/gsi;
    $Pages[$num_articles] = $1;
    $table_array[1][2] =~ m/(.*?)
/gsi;
    $Authors[$num_articles] = $1;
    $table_array[1][2] =~ m/(.*?)
/gsi;
    $Abstract_ref[$num_articles] = $1;

    # now get the abstract reference
    if ( defined $Abstract_ref[$num_articles]
        and $Abstract_ref[$num_articles] =~ m/href="(http:[^"]*?)"/si )
    {
        my $url3 = $1;
        my $content3 = get($url3);
        # The result of THIS download is what has to be checked (the earlier
        # code re-tested the volume list, which is always defined here).
        die "Couldn't get the abstract page $url3 " unless defined $content3;
        if ( $content3 =~ m{href="http://dx\.doi\.org/([^"]*?)"}si ) {
            $Art_DOI[$num_articles] = $1;
        }
        else {
            warn "No DOI link found on the abstract page $url3\n";
        }
    }
    else {
        warn "No abstract link found for article number $num_articles\n";
    }
}
# Now see if XML files are available for each article.
# For every article: find a local ThermoML file with the same page range; if
# there is none, write a citation-only XML file named after the DOI suffix,
# otherwise link that name to the existing file.  Finally link the file into
# the ./10.1016 directory.

# Create a symbolic link without going through the shell.  A link that is
# already there (for example from an earlier run) is not reported.
my $make_link = sub {
    my ( $target, $link ) = @_;
    return 1 if symlink( $target, $link );
    my $err = $!;
    warn "Couldn't link $link -> $target: $err\n" unless -l $link;
    return 0;
};

for my $icount ( 1 .. $num_articles ) {
    # First split the authors.  "and" only separates names when it stands
    # between whitespace, so names such as "Anderson" stay in one piece.
    my $author_text = defined $Authors[$icount] ? $Authors[$icount] : "";
    my @author_array = grep { /\S/ }
        split /\s*,\s*(?:and\s+)?|\s+and\s+/, $author_text;

    # Year followed by the first three letters of the last names of the
    # first two authors.  (The earlier ".=" appended the names twice.)
    # Left as a package variable because nothing in this part of the script
    # reads it.  TODO confirm whether it is used elsewhere.
    $trcname = "";
    my $cnt = 0;
    while ( $cnt < 2 and $cnt < @author_array ) {
        # find last names
        if ( $author_array[$cnt] =~ m{(\S+)\s*$} ) {
            $trcname .= lc substr( $1, 0, 3 );
        }
        $cnt++;
    }
    $trcname = $iss_year . $trcname;

    # Look for a ThermoML file whose page range occurs in this article's
    # page text.  The range is quoted (it is data, not a pattern) and must
    # not be embedded in a longer number; empty ranges are ignored.  As
    # before, the last matching file wins.
    $Art_xmlfile[$icount] = "";
    my $page_text = defined $Pages[$icount] ? $Pages[$icount] : "";
    for my $iloc ( 0 .. $ThermoML_count - 1 )
    {
        next unless defined $ThermoML_pages[$iloc];
        $ThermoML_pages[$iloc] =~ s/^\s+//;
        $ThermoML_pages[$iloc] =~ s/\s+$//;
        my $tml_pages = $ThermoML_pages[$iloc];
        next unless length $tml_pages;
        if ( $page_text =~ m/(?<![0-9])\Q$tml_pages\E(?![0-9])/ ) {
            $Art_xmlfile[$icount] = $ThermoML_fname[$iloc];
        }
    }

    # generate citation only xml files for articles with no TRC provided files
    # The local file name is the part of the DOI after the first slash.
    my $doi = defined $Art_DOI[$icount] ? $Art_DOI[$icount] : "";
    unless ( $doi =~ m/\/(\S+)/s ) {
        warn "No usable DOI for article number $icount, no xml file written or linked\n";
        next;
    }
    my $local_doi = $1."\.xml";

    # NOTE(review): the delimiter that ends the title in this pattern is a
    # plain space in this copy; it may originally have been an HTML entity.
    # Kept exactly as found -- confirm against the original source.
    my $local_title = defined $Title[$icount] ? $Title[$icount] : "";
    if ( $local_title =~ m/\s*(.*?)\ /si ) {
        $local_title = $1;
    }
    my $local_pages = ( $page_text =~ m/Pages\s*([0-9]+-[0-9]+)/si ) ? $1 : "";

    if (not $Art_xmlfile[$icount])
    {
        $Art_xmlfile[$icount] = "\./".$doi."\.xml";
        open my $xmltag, '>', $local_doi or die "you can't open this file $local_doi \n";
        print {$xmltag} <<"ENDOFXMLHEAD";
2
0
journal
Original
ENDOFXMLHEAD
        foreach my $auth_name (@author_array)
        {
            # Work on a copy so the author list itself is left untouched.
            my $auth_temp = $auth_name;
            $auth_temp = HTML_to_text($auth_temp);
            $auth_temp =~ s/^\s*(.*?)\s*$/$1/s; #trim spaces in front and back
            # The last word is the last name, everything before it the first name(s).
            next unless $auth_temp =~ m/^(.*?)(\S+)$/;
            my $first_name = $1;
            my $last_name  = $2;
            $first_name =~ s/^\s*(.*?)\s*$/$1/s; #trim spaces in front and back
            print {$xmltag} "\t\t$last_name, $first_name\n";
        }
        $local_title = HTML_to_text($local_title);
        print {$xmltag} <<"ENDOFXMLTAIL";
J. Chem. Thermodyn.
$iss_year
$local_title
$vol_num
$local_pages
ENDOFXMLTAIL
        # Buffered write errors only show up when the handle is closed.
        close $xmltag or die "couldn't finish writing $local_doi: $!\n";
    }
    else
    {
        $make_link->( $Art_xmlfile[$icount], $local_doi );
    }
    # print "Article number $icount has xml file $Art_xmlfile[$icount] \n";
    $make_link->( "../$local_doi", "./10.1016/$local_doi" );
}
# Emit the page header, then one table entry per article.
print_head( $vol_num, $iss_num, $iss_title, $url2 );

# print tables
foreach my $icount ( 1 .. $num_articles ) {
    # File name derived from the part of the DOI after the first slash.
    $Art_DOI[$icount] =~ m/\/(\S+)/s;
    my $local_doi_xml = $1 . "\.xml";

    # Odd-numbered rows get a background colour attribute.
    $rowcolor = ( $icount % 2 ) ? " bgcolor=\"#EDEDED\"" : "";

    print <<"ENDOFTABLE";
|
$Title[$icount]
$Pages[$icount]
$Authors[$icount]
ThermoML Data (To download: right-click on link and select "Save Link Target As" )
|
ENDOFTABLE
    #the following line adds a link via the DOI to the article abstract
    #Link to Abstract on Journal Website (This will take you out of NIST web-space)
}
print "
\n";
print "