# Copyright Alan Jackson, 2004
# This product is protected under the GPL open source license. It may be used and modified,
# but the source must remain public and this notice must be retained.

# Input file looks like this (without the leading pound signs):
# Date Folder Size IP Domain
# 2004-05-31 Yorkshire 3.816 66.43.18.41 rootsweb.com
# 2004-05-31 sword 4.992 64.140.154.250 crosswire.org
# 2004-05-31 sword 4.014 64.140.154.250 crosswire.org
# 2004-05-31 spam-l 2.72 209.119.0.109 lsoft.com
# 2004-05-31 magdalen 2.665 192.147.236.25 herberthouse.org
# 2004-05-31 Spamtest 4.455 222.98.248.221 kornet.net
# 2004-05-31 autospam 2.413 24.199.125.9 mindspring.com
# 2004-05-31 Spamtest 4.999 80.88.129.21 emperion.net
# 2004-05-31 magdalen 4.67 192.147.236.25 herberthouse.org
# 2004-05-31 Spamtest 3.32 68.196.135.83 optonline.net
# 2004-05-31 magdalen 2.681 192.147.236.25 herberthouse.org
# 2004-05-31 spam-l 2.676 209.119.0.109 lsoft.com
# 2004-05-31 inbox 25.471 62.138.31.130 softmaker.com
# 2004-05-31 spam-l 4.879 209.119.0.109 lsoft.com
# 2004-05-31 Spamtest 3.914 24.90.215.52 rr.com
# 2004-05-31 NovaScotia 2.898 66.43.18.39 rootsweb.com
# 2004-05-31 sword 4.729 64.140.154.250 crosswire.org
# 2004-05-31 magdalen 2.631 192.147.236.25 herberthouse.org
# 2004-05-31 Spamtest 3.247 213.213.207.126 126.brutele.be

# Types are L - mailing list, O = other, U - Unique IP, S - spam, I - inbox

# Read the mail log and tag every record with a one-letter folder type.
#
# Args:
#   path: whitespace-separated log file with a header row; the default is
#         the location the script has always read from.
#
# Returns a data frame with columns Folder, inDate (the date as read from
# the file), Size, IP, Domain, Type (L/O/U/S/I, see legend above) and Date
# (inDate parsed as a date-time).
readdata <- function(path = "/home/ajackson/var/log/Rdata") {
  # The folder lookup below is built from factor levels, so ask for factors
  # explicitly: since R 4.0 read.table() returns character columns by
  # default, and levels() of a character vector is NULL.
  rawdata <- read.table(path, header = TRUE, stringsAsFactors = TRUE)

  # Drop the two excluded days and the "aol" folder.
  rawdata <- subset(rawdata, rawdata$Date != "2003-10-08")
  rawdata <- subset(rawdata, rawdata$Date != "2003-10-09")
  rawdata <- subset(rawdata, rawdata$Folder != "aol")

  # Every folder is a mailing list ("L") unless it is named below.
  folders <- levels(rawdata$Folder)
  type <- rep("L", length(folders))
  type[folders == "inbox"] <- "I"
  type[folders %in% c("Abuse", "Spammish", "Starman", "aol", "aolspam")] <- "O"
  type[folders %in% c("Spamtest", "Virus", "autospam", "spam",
                      "joejob", "plonk")] <- "S"
  type[folders == "UniqIP"] <- "U"
  lookup <- data.frame(types = folders, t = type, stringsAsFactors = TRUE)

  # Join on the folder name (column 2 of the log, column 1 of the lookup);
  # merge() puts the key column first in the result.
  merged <- merge(rawdata, lookup, by.x = 2, by.y = 1)
  merged <- cbind(
    merged,
    as.POSIXct(strptime(as.character(merged$Date), format = "%Y-%m-%d"))
  )
  names(merged) <- c("Folder", "inDate", "Size", "IP", "Domain", "Type", "Date")
  merged
}

# Plot daily counts of spam/virus mail (black), inbox mail (red) and unique
# IPs (blue), with a robust linear fit, an exponential fit and a loess
# smooth of the spam counts.
#
# Args:
#   data: data frame as returned by readdata(); the Date and Type columns
#         are used.
#
# Draws on the current graphics device. Graphical parameters changed here
# are restored on exit, including when an error interrupts the plot.
plot1 <- function(data) {
  if (!requireNamespace("MASS", quietly = TRUE)) {
    stop("plot1() needs the MASS package for rlm()", call. = FALSE)
  }

  # One table per series: exclude every type except the one wanted, then
  # take the single remaining column as a named vector of counts per day.
  byday <- table(data$Date, data$Type, exclude = c("O", "L", "U", "I"))[, 1]
  ibyday <- table(data$Date, data$Type, exclude = c("O", "L", "S", "U"))[, 1]
  ubyday <- table(data$Date, data$Type, exclude = c("O", "L", "S", "I"))[, 1]
  spam_days <- as.POSIXct(names(byday))
  inbox_days <- as.POSIXct(names(ibyday))
  uniq_days <- as.POSIXct(names(ubyday))

  keep <- par(lty = 1, pch = 19, tck = 1)
  on.exit(par(keep), add = TRUE)

  plot(spam_days, byday, xlab = "Date", ylab = "Number per Day",
       main = "Spam and Inbox Mail Received")
  points(inbox_days, ibyday, pch = 19, col = "red")
  points(uniq_days, ubyday, pch = 19, col = "blue")
  abline(lm(ibyday ~ inbox_days), col = "red", lty = 2)
  z1 <- MASS::rlm(byday ~ spam_days)
  abline(z1, col = "black", lty = 2)

  # loess: smooth against the day index, then stretch the smoothed curve
  # over the plotted date range.
  loessline <- loess.smooth(seq_along(byday), byday)$y
  xaxis <- seq(from = as.numeric(spam_days[1]),
               to = as.numeric(spam_days[length(byday)]),
               length.out = length(loessline))
  lines(xaxis, loessline, col = "blue")

  legend(as.POSIXct("2003-10-15"), 320,
         legend = c("Spam and Viruses", "Inbox", "Unique IP"),
         col = c("black", "red", "blue"), pch = 19, bg = "white", lty = 2)

  # Exponential fit: robust regression on log(count + 1), drawn back on the
  # original scale.
  logbyday <- log(byday + 1)
  z2 <- MASS::rlm(logbyday ~ spam_days)
  lines(spam_days, exp(z2$fitted.values) - 1, lty = 6, col = "darkblue")

  # The fits are per second of POSIXct time; 3600*24 converts to per day.
  # lty has one entry per legend line; the loess curve is drawn solid.
  legend(as.POSIXct("2003-10-15"), 230,
         legend = c(paste("Linear fit, slope = ",
                          round(z1$coefficients[2]*3600*24, digits = 2),
                          " spam/day"),
                    paste("Exponential fit, doubling time = ",
                          trunc((log(2)/z2$coefficients[2])/(3600*24) + .5),
                          "days"),
                    "Smoothed Loess Fit"),
         lty = c(2, 6, 1), col = c("black", "darkblue", "blue"), bg = "white")

  #rect(as.POSIXct("2004-1-2"),55,as.POSIXct("2004-3-2"),65, col="white")
  #text(as.POSIXct("2004-1-1"),60,"CAN SPAM begins", pos=4)
  legend(as.POSIXct("2004-1-1"), 60, legend = "CAN SPAM begins", bg = "white")

  # Turn clipping off so the copyright line can sit below the plot region.
  keep_xpd <- par(xpd = NA)
  on.exit(par(keep_xpd), add = TRUE)
  text(as.POSIXct("2003-10-01"), -50,
       label = paste("copyright Alan Jackson",
                     substr(as.character(Sys.time()), 1, 10)),
       pos = 4)
  invisible(NULL)
}

##################

# Horizontal bar chart titled "Top Spam domains", labelled with the names
# of the plotted entries.
#
# Args:
#   days: when > 0, the title gains "over the past <days> days".
#
# Reads a variable named `data` that is not a parameter, so it must exist
# in an enclosing environment.
#
# NOTE(review): the `time = subset(...` statement below is damaged - text
# appears to have been lost between a "<" and a ">" (the comparison against
# `days`, and whatever built `bydomain` and `topten`). As it stands this
# function does not parse. The code is left exactly as found; restore the
# missing statements from an intact copy of the file.
topten <- function (days=1) {
  extra = "";
  if (days>0) {
    extra=paste("over the past ",days," days");
  }
  # `lab` is not used in any of the code that survives here.
  if (days > 28) lab = "month"
  time = subset(data, julian(Sys.time())-julian(data$Date) max(rank(bydomain))-10);
  midpts = barplot(topten,main=paste("Top Spam domains ",extra), xlab="Number of Spam Received",horiz=T,axisnames=F,col="yellow");
  # Bars shorter than the mean get their label to the right of the bar end
  # (pos 4); the others get it to the left (pos 2).
  pos=c()
  for (i in as.integer(topten)) {
    if ( i < mean(as.integer(topten))) { pos = c(pos,4)} else {pos=c(pos,2)}
  }
  text(as.integer(topten), midpts, labels=names(topten),pos=pos)
}

##################

# Horizontal bar chart titled "Top Virus <cpu>", built from the rows of
# `data` whose Folder is 'Virus'.
#
# Args:
#   days:    when > 0, restricts the records by age (details lost, see note).
#   domORip: not referenced in the code that survives here; presumably it
#            selects domains versus IP addresses - TODO confirm.
#
# Reads a variable named `data` that is not a parameter.
#
# NOTE(review): the `virus = subset(virus, ...` statement below is damaged
# in the same way as in topten() - text is missing between a "<" and a ">",
# so the end of the `if (days>0)` block and the code that built `bydomain`
# and `topten` are gone. The braces only appear to balance. The code is
# left exactly as found; restore it from an intact copy of the file.
toptenV <- function (days=0, domORip='dom') {
  virus = subset(data, Folder=='Virus')
  extra=""
  cpu="Domains"
  if (days>0) {
    virus = subset(virus, julian(Sys.time())-julian(virus$Date) max(rank(bydomain))-10);
    midpts = barplot(as.vector(topten),main=paste("Top Virus ",cpu," ",extra), xlab="Number of Viruses Received",horiz=T,axisnames=F,col="yellow");
    # Same label placement rule as in topten().
    pos=c()
    for (i in as.integer(topten)) {
      if ( i < mean(as.integer(topten))) { pos = c(pos,4)} else {pos=c(pos,2)}
    }
    text(as.integer(topten), midpts, labels=names(topten),pos=pos)
  }
}

##################

# NOTE(review): virusperday() continues beyond this point; the statements
# below are unchanged and the last one is completed by the text that follows.
virusperday <- function (data, days=0) {
  keep=par(tck=0)
  virus = subset(data, Folder=='Virus')
  extra = "";
  if (days>0) {
    virus = subset(virus, julian(Sys.time())-julian(virus$Date)