#!/perl/bin use strict; # tracedomains.pl - Version 1.1 (Aug 3th, 1998) # # Perl-Script by Kajetan Hinner. This program is Mailware, every user should write to Kajetan@Hinner.com # tracedomains.pl analyzes the actual plaze where a domain is hosted. it creates reports, which # can be used in Excel or SPSS for further analysis. # tracedomains.pl relies heavily on traceroute. traceroute's output is not checked at all. So please # refer to its manual for further reading on abilities and constraints of traceroute. # Also note that there is no taint checking AT ALL. This could be a security problem, if you don't # know the quality of input data fed into tracedomains.pl. If you think about doing interactive # domain tracing, check the supplied data for special characters. # # # # tracedomains.pl was developed using Linux 2.1.x, ircII 4.4, Perl 5.004 and KDE 4. As editor for large files I recommend nedit. # # How this script works: # # First www. is input for the traceroute command. If there is no output, just # is used. If this still gives no output, the next domain will be tried. # All the output lines from the traceroute command are analyzed, beginning with the last line. # If there is any match between the fully qualified domain name and one entry of the tracedomains.data # file, the suitable country code is taken from this file and stored in a hash. More additional # information is calculated, like round trip time, from the last entry. If there is no match, # the next line (counted from the end) is taken and searched. # You will easily understand, that it's important to be very careful when editing the tracedomains.data # file. Just enter domain names where you are quite sure where they are located, i.e. when the # domain name contains the name of a city or country. # # # # Author: Kajetan Hinner, University of Rostock, Sociology Dep., Kajetan@Hinner.com # # Notice: My aim was to write a working program which is reliable and easy to maintain. It was not intended to write # state of the art perl code, which may be hard to read for others. Some remarks are in german. Sorry for that. # I know that it's unwise to use traceroute that extremely, but I saw no other possibility. # If you want to adapt this script for another country, search for ***. Thus marked lines # should be looked at and maybe changed. # use Time::Local; # use Date::Manip; my $zeile = 0; # Actual string of input file my $lineno = 0; # line number of input file my $domain_org = $zeile; # save domain name, i.e. 'zzz.de' my $tldomain; # Top Level Domain my $fqdn; # fully qualified domain name; extracted from traceroute, used everywhere my $search; # item, scalar my @tracedomain; # list; contains output of traceroute command for specific domain my %data; # stores all information about processed domains # i.e. $data{"zzz.de"}{"status"}="unreachable" # other known entries: "hopcount", "rttmean" # "pattern": search pattern which matched for the country-search # "country": estimated country the server is located # "tld": last readable fqdn tld of traceroute output # "traceroute": output of traceroute command # "comment": printed in logfile. my %guesscountry; # contains certain domain names (organizations) and their countries # read from a data file which is supplied with tracedomains.pl for .de my %valid; # lines for which guesscountry entry is valid (counting from tail) # my $counter = 0; # sub # data file must be like # de 0 \.de.mci.net # de: country # valid-tag (0 everywhere, #: lines counting from tail) # escaped ., rest. sub read_infile { my $data_in = shift; my @linesread; print ("reading from file $data_in, input was @_\n"); open(IN, $data_in) or die "could not read $data_in: $!\n"; @linesread = ; close IN; die "no domains to read in $data_in\n" unless @linesread; foreach (@linesread) { chop; # get rid of \n next if /^#/; # skip comments next if /^\s*$/; # skip whitespace s/\s$//; # get rid of trailing blanks /^(\S+)\s+(\d+)\s+(\S+)$/; # separate fields print ("Domain: $3, valid $2, country $1\n"); $guesscountry{$3}=$1; $valid{$3}=$2; } } # first job: scanning for arguments # scan @ARGV my $self = $0; # Name of perl-script $self =~ s!^.*/!!; if (@ARGV != 1) { die "Usage: perl $self datafile(.log)\n"; } # my $filevar ="Domains.txt"; my $filevar = $ARGV[0]; my $domain_outfile = $filevar . ".out"; my $error_outfile = $filevar . ".error"; my $log_outfile = $filevar . ".log"; $self =~ /(\S*).pl/; # i.e. tracedomains.data my $data_infile = $1 . ".data"; read_infile($data_infile); # read guesscountry hash from file open (FILE, "<".$filevar) or die ("Could not open".$filevar.": $!\n"); open (ERROROUT, "> $error_outfile") or die ("Can't open $error_outfile : $!"); open (LOGOUT, "> $log_outfile") or die ("Can't open $log_outfile : $!"); while ($zeile = ) # get next line, lower case { $lineno++; # increment line counter printf ( "'$zeile'\n"); next if ($zeile =~ /^\s*#/); # skip comments and empty lines next if ($zeile =~ /^\s*$/); # First of all we need to strip all leading (and maybe following) blanks $zeile =~ s/^\s+//; $zeile =~ s/\s$//; # printf ( "without white space: '$zeile'\n"); # Following tests if it could be a domain name or not. so we get the last four characters # and search for a "." in it if ((my $domain_tld) = ($zeile =~ /^.*\.(\S{2,4}?)$/)) { # $domain_org = $zeile; # save domain name, i.e. 'zzz.de', just after we know if # www.domain.tld or domain.tld gains results. # ok, now starting a traceroute. first try with "www" as praefix, # if this yields no output, try plain domain name @tracedomain = qx { traceroute "www.$zeile" }; $domain_org = "www.$zeile"; if (!@tracedomain) {@tracedomain = qx { traceroute "$zeile"}; $domain_org = $zeile;}; $data{$domain_org}{'traceroute'}= join("", @tracedomain) ; # save traceroute output # print ("output of traceroute: '@tracedomain'\n"); my $tdline=@tracedomain; # length of traceroute-output # now we need to get line-by-line from the end and check for a good if ($tdline) { my $rtt1; my $rtt2; my $rtt3; # roundtriptime for each packet traceroute prints my $rttmean=0; my $rttcount=0; # mean of max. three round-trip-time values; counter for values # $fqdn: # fully qualified domain name; extracted from traceroute my $hopcount = $tdline; # no. of hops; will be decreased later if "* * *" happened... # sample traceroute output: # merit.WillowSprings.mci.net (166.48.23.254) 200.436 ms 198.885 ms 209.062 ms # examining the last entry my $traceline = lc(@tracedomain[$tdline-1]); print ("Last line $tdline of traceroute '$traceline'\n"); if (($fqdn, $rtt1, $rtt2, $rtt3) = ($traceline =~ /^ ?\d+ [\* ]{0,4} (\S+) $.+$ (.+ ms|\*) (.+ ms|\*) (.+ ms|\*).*/)) { # calculate mean of round-trip-time, using only valid input (note: * also possible!) if ($rtt1 =~ /(.+) ms/) { $rttmean += $1; $rttcount++; } if ($rtt2 =~ /(.+) ms/) { $rttmean += $1; $rttcount++; } if ($rtt3 =~ /(.+) ms/) { $rttmean += $1; $rttcount++; } # print ("rttsum: $rttmean; rttcount: $rttcount\n"); if ($rttcount) {$rttmean /= $rttcount; } else {$rttmean =0}; printf ("Fully qualified domain name: '$fqdn', packets: $rtt1, $rtt2, $rtt3, rtt-mean: %.2f \n", $rttmean); if ($rttmean == 0) {$rttmean =""}; # don't accept 0-values $data{$domain_org}{'rttmean'}=sprintf("%.2f", $rttmean); # save in hash $data{$domain_org}{'status'}="ok"; $data{$domain_org}{'hopcount'}=$hopcount; } else { $data{$domain_org}{'status'}="reach?"; $data{$domain_org}{'hopcount'}="\>$hopcount"; $data{$domain_org}{'rttmean'}=""; }; # now searching for 'country'. which hops can be identified for their location SEARCHTLD: while ($tdline--) { $traceline = lc(@tracedomain[$tdline]); # reg exp: match several digits, then 0-6 characters * or space, then non-whitespace, # then ( and several characters followed by ) if (($fqdn) = ($traceline =~ /^ ?\d+ [* ]{0,6}(\S+) $.+$/)) { print ("Line $tdline: '$traceline', fqdn: '$fqdn'\n"); # now searching for destination traceroute leads to if (!($data{$domain_org}{'country'})) { # extract tld # extracting topleveldomain (last occurence of .something) $fqdn =~ /\S+\.(\S+)$/; my $tld = $1; # top level domain print ("tld: '$tld' and fqdn: $fqdn\n"); if ($tld =~ /\D/) { # tld consists only of digits? (if so, skip) # if top level domain entry for this domain is not set, set it. # the difference to the following is: entries like .net or .com will # be in the "tld" field, but - if possible- not in the "country" field. # there we try to find out the country using names like cities or whatever if (! ($data{$domain_org}{'tld'})) { $data{$domain_org}{'tld'} = $tld; }; my $country; # destination country of traceroute SEARCHCOUNTRY: foreach $search (keys %guesscountry) { # print ("actual: '$search'\n"); if (($fqdn =~ /$search/)) { # number of lines for which guesscountry entry is valid my $valid=$valid{$search}; # read value entry if ($valid && ($valid < (@tracedomain - $tdline))) { my $adjustedline = $tdline+1; $data{$domain_org}{'comment'} .= "# Possible match, but invalid entry for $fqdn, searched for $search, valid $valid at line $adjustedline\n"; next SEARCHCOUNTRY; # continue searching } $country = $guesscountry{$search}; print ("found entry for '$fqdn' in guesscountry, $search: country=$country\n"); $data{$domain_org}{'pattern'} = $search; $data{$domain_org}{'match'} = $fqdn; # ok. now fill the hash if (! ($data{$domain_org}{'country'})) { $data{$domain_org}{'country'} = $country; last SEARCHCOUNTRY; } } # end if entry was found } # continue looping over foreach elements... } # end if plain ip number } # end if country field was not set yet } # end if actual line contains valid traceroute output else { print (" no match: '$traceline'\n"); # this should not happen. reg exp. should be revised then } if (($data{$domain_org}{'country'}) && ($data{$domain_org}{'tld'})) { # print ("while can be ended!\n"); last; # } } # end of while if (! ($data{$domain_org}{'country'})) { (print ERROROUT "Could not resolve $domain_org:\n @tracedomain\n\n\n") or die "Can't write to $domain_outfile: $!"; } } # end of if traceroute-output was valid else {$data{$domain_org}{'status'}="no_conn"}; } # end of if - processed if line contains valid domain name printf ("right now I was at $lineno and checking '$zeile'\n"); } close(FILE); # Writing output files; all domains, their hops and mean round-trip-time, country and tld # output is reversed because of long domain names and formatting... so first are # country (guess) | tld | number of hops | mean round trip time | domain-name open (DOMAINOUT, "> $domain_outfile") or die "Can't open $domain_outfile : $!"; (print DOMAINOUT "Status\tCountry\tTLD\tHops\trt-time\tDomain\n") or die "Can't write to $domain_outfile: $!"; my $status; # domain reachable or not? my $country; # guess of destination country my $tld; # top level domain (was last tracerout output with tld containing alphanumeric) my $hopcount; # number of hops my $rttmean; # round-trip-time, mean my $fqdn; # fully qualified domain name foreach $fqdn (sort keys %data) { $status = $data{$fqdn}{'status'}; $country = $data{$fqdn}{'country'}; $tld = $data{$fqdn}{'tld'}; $hopcount = $data{$fqdn}{'hopcount'}; $rttmean = $data{$fqdn}{'rttmean'}; (print DOMAINOUT "$status\t$country\t$tld\t$hopcount\t$rttmean\t$fqdn\n") or die "Can't write to $domain_outfile: $!"; # same information, but with additional traceroute information (print LOGOUT "$status\t$country\t$tld\t$hopcount\t$rttmean\t$fqdn\n") or die "Can't write to $log_outfile: $!"; (print LOGOUT "# Matched '$data{$fqdn}{'match'}' with pattern: '$data{$fqdn}{'pattern'}\n") or die "Can't write to $log_outfile: $!"; (print LOGOUT "$data{$fqdn}{'comment'}") or die "Can't write to $log_outfile: $!"; (print LOGOUT "Traceroute output of $fqdn\n") or die "Can't write to $log_outfile: $!"; (print LOGOUT "$data{$fqdn}{'traceroute'}\n\n\n") or die "Can't write to $log_outfile: $!"; } close (DOMAINOUT); close (ERROROUT); close (LOGOUT);