By PhDs, (© Copyright reserved),
March 10, 2010 at 11:49 am
· Category: Perl
This is my HTTP log file parser. It will tell you who is crawling your site. I hope you like it.
#!/usr/local/perl
#
# HTTP access-log crawler filter.
#
# For every "*_access.log*" file in the current directory this script:
#   * writes "<logfile>.txt", a copy of the log without the lines produced
#     by the known crawlers listed in @crawlers;
#   * prints the number of lines matched per crawler for that log file;
#   * counts the requests made by each client address.
# At the end it writes "grant.txt": one line per client address holding the
# request count, the address and the user agent seen on that address's first
# request, ordered by descending request count.
use strict;
use warnings;

# Skip "<logfile>.txt" files: they are this script's own output from an
# earlier run and also match the glob, which would double-count everything.
my @files = grep { !/\.txt\z/ } glob('*_access.log*');

# Request count per client address, and the first user agent seen for it.
# These names are used by the sort comparators defined later in the file.
my %address = ();
my %agents  = ();

# Known crawlers: [ counter name, pattern looked for anywhere in the line ].
# The same table drives both the filtering and the counting, so a line can
# never be filtered out without being counted.
my @crawlers = (
    [ msn          => qr/msn\.com/ ],
    [ twiceler     => qr/twiceler/ ],
    [ cuil         => qr/cuil\.com/ ],
    [ yahoo        => qr/Yahoo! Slurp/ ],
    [ ggl          => qr/Googlebot/ ],
    [ Jeeves       => qr/Ask Jeeves/ ],
    [ Yandex       => qr/Yandex/ ],
    [ legs         => qr/80legs/ ],
    [ Baiduspider  => qr/Baiduspider/ ],
    [ dotnetdotcom => qr/dotnetdotcom/ ],
    [ seoprofiler  => qr/seoprofiler/ ],
);

foreach my $file (@files)
{
    # Per-file hit counters, all starting at zero.
    my %hits = map { $_->[0] => 0 } @crawlers;

    my $in;
    unless (open($in, '<', $file))
    {
        # One unreadable log should not stop the remaining logs.
        warn "Cannot read $file: $!\n";
        next;
    }
    my $outfile = "$file.txt";
    open(my $out, '>', $outfile) or die "Cannot write $outfile: $!\n";

    while (my $line = <$in>)
    {
        # Split on double quotes. The first field starts with the client
        # address; the last field is whatever follows the final quote
        # (normally just the newline) and the one before it is taken as the
        # user agent. LIMIT -1 keeps a trailing empty field, so a final line
        # without a newline is handled like every other line.
        my @quoted = split(/"/, $line, -1);
        my $prefix = shift(@quoted);
        pop(@quoted);
        my $agent = pop(@quoted);
        $agent = '' unless defined $agent;

        # The client address is the first space-separated token.
        my ($ip) = split(/ /, defined $prefix ? $prefix : '');
        $ip = '' unless defined $ip;

        $address{$ip}++;
        $agents{$ip} = $agent unless exists $agents{$ip};

        # A line may match several crawlers; each match is counted.
        my $is_crawler = 0;
        foreach my $crawler (@crawlers)
        {
            my ($name, $pattern) = @{$crawler};
            next unless $line =~ $pattern;
            $hits{$name}++;
            $is_crawler = 1;
        }

        # Only lines that belong to no known crawler reach the filtered copy.
        print {$out} $line unless $is_crawler;
    }

    close($in);
    # Buffered write errors only surface when the handle is closed.
    close($out) or die "Cannot finish writing $outfile: $!\n";

    # unlink ($file);
    # rename ($outfile, $file);
    print "google: $hits{ggl}, Yahoo: $hits{yahoo}, Cuil: $hits{cuil}, twiceler: $hits{twiceler}, Jeeves: $hits{Jeeves}, Yandex: $hits{Yandex}, legs: $hits{legs}\n";
    print "Baiduspider: $hits{Baiduspider}, dotnetdotcom: $hits{dotnetdotcom}, msn: $hits{msn}, seoprofiler: $hits{seoprofiler} \n";
}

# Per-address summary, busiest address first.
open(my $report, '>', 'grant.txt') or die "Cannot write grant.txt: $!\n";
foreach my $key (sort hashValueDescendingNum (keys(%address)))
{
    print {$report} "$address{$key} \t $key\t $agents{$key}\n";
}
close($report) or die "Cannot finish writing grant.txt: $!\n";
# Sort comparator: orders two %address keys ($a, $b) by ascending numeric
# value stored in %address.
sub hashValueAscendingNum {
    return $address{$a} <=> $address{$b};
}
# Sort comparator: orders two %address keys ($a, $b) by descending numeric
# value stored in %address, i.e. the largest value comes first.
sub hashValueDescendingNum {
    return $address{$b} <=> $address{$a};
}

Permalink
By PhDs, (© Copyright reserved),
February 14, 2010 at 11:39 am
· Category: Perl
PubMed
NLM currently leases PubMed journal citations, at no charge.
Other Entrez Databases
Email Address
If you choose to provide an email address we will use it to contact you if there are problems with your queries or if we are changing software interfaces that might specifically affect your requests. If you choose not to include an email address you can sign up for utilities-announce to receive general announcements.
Example: email=name@institution.org
Display Formats
Display Formats for Sample Entrez Databases
Note: Scripts/programs that import XML should use E-Utilities.
| Database | Display Format |
|---|---|
| PubMed | DocSum, Abstract, MEDLINE, XML |
| Nucleotide | DocSum, Brief, GenBank, ASN1, FASTA, ExternalLink, XML, Graph, fasta_xml, gbc_xml |
| OMIM | Detailed, Synopsis, Variants, ASN1, XML, ExternalLink |
| Gene | DocSum, Full_Report, ASN1, XML, Gene_Table, ExternalLink |
| Protein | DocSum, Brief, GenPept, ASN1, FASTA, ExternalLink, XML, graph, fasta_xml, gpc_xml |
| Genome | DocSum, Brief, ASN1, ExternalLink, XML, Protein Table, cDNA FASTA, Protein FASTA, Structural RNA Table, Contig Table |
| Structure | DocSum, Brief |
| PopSet | DocSum, Brief, ASN1, ExternalLink |
| Taxonomy | DocSum, Brief, TxUidList, TxInfo, TxTree, ExternalLink, XML |
Permalink
By PhDs, (© Copyright reserved),
March 7, 2009 at 12:32 pm
· Category: Perl
Use this Perl Script
#!/usr/bin/perl
#
# Fetches one web page and prints every link found in its <a>, <body> and
# <img> elements, one link per line.
use strict;
use warnings;

# load LWP library:
use LWP::UserAgent;
use HTML::Parse;

# define a URL
my $url = 'https://www.cnn.com/';

# create UserAgent object
my $ua = LWP::UserAgent->new;

# set a user agent (browser-id)
# $ua->agent('Mozilla/5.5 (compatible; MSIE 8; Windows NT 5.1)');

# timeout:
$ua->timeout(15);

# perform the request:
my $request  = HTTP::Request->new(GET => $url);
my $response = $ua->request($request);

#
# responses:
#
# response code (like 200, 404, etc)
my $code = $response->code;

# headers (Server: Apache, Content-Type: text/html, ...)
my $headers = $response->headers_as_string;

# Stop on a failed request instead of extracting links from an error page.
unless ($response->is_success)
{
    die "GET $url failed with status $code (" . $response->status_line . ")\n$headers";
}

# HTML body:
my $body = $response->content;

# print the website content:
# print $body;

# do some parsing:
my $parsed_html = HTML::Parse::parse_html($body);

# extract all links (a, body, img); each entry is an array reference whose
# first element is the link itself
for my $link_info (@{ $parsed_html->extract_links(qw(a body img)) }) {
    my ($link) = @{$link_info};
    print $link . "\n";
}

# Release the parse tree explicitly once the links have been printed.
$parsed_html->delete;
Permalink