#!/usr/bin/perl ## tkooda : 2005-01-27 : domains-from-rfc822 : v0.0.1 ## http://devsec.org/software/misc/domains-from-rfc822 ## USAGE: ## domains-from-rfc822 < rfc822.msg 2>/dev/null > domains.txt ## ## ABOUT: ## - print (via STDOUT) the valid (via regex, no DNS lookup) domain ## names from an rfc822 email message body (via STDIN) ## ## NOTES: ## - will print "bar.com" instead of "http://foo.bar.com" ## - will print "bar.co.uk" instead of "foo.bar.co.uk" ## - will recurse through multipart/nested mime parts/attachments ## - for usage with a rhsbl ## - Mail::Message->decoded() might print warnings to STDERR on invalid types use Mail::Message; use URI::Find::Schemeless; use URI::Split qw(uri_split); use Mail::SpamAssassin::Util::RegistrarBoundaries qw( is_domain_valid, trim_domain ); use strict; my $msg = Mail::Message->read( \*STDIN ); handle_body( $msg ); exit; sub handle_body { my ( $msg ) = @_; if ( ! $msg->isMultipart ) { handle_part( $msg ) if ( $msg->decoded->type =~ m/^(text|message)/i ); return; } foreach my $part ( $msg->decoded->parts ) { handle_body( $part ); # recurse for nested attachment parts } } sub handle_part { my ( $msg ) = @_; my @uris; my $finder = URI::Find::Schemeless->new( sub { push @uris => $_[0] } ); $finder->find( \$msg->decoded ); foreach my $uri ( @uris ) { my ($scheme, $auth, $path, $query, $frag) = uri_split( $uri ); if ( $scheme =~ m/^(http|ftp)/i ) { # use SpamAssassin to only print valid zones (and no IPs) my $domain = Mail::SpamAssassin::Util::RegistrarBoundaries::trim_domain( $auth ); print "$domain\n" if ( Mail::SpamAssassin::Util::RegistrarBoundaries::is_domain_valid( $domain ) ); } } }