Wednesday, 3 September 2008

Benford's Law in "R"

The following is a package to run a Benford's law analysis using the R statistical package. I first did this as a project with the University of Newcastle. It has been updated since with help from staff and will continue to develop.

Package: BenfordsLaw
Version: 0.9
Date: 2006-11-01
Title: An Implementation of Benford's Law
Author: Craig Wright
Maintainer: Craig Wright
Depends: R (>= 2.3.0)
Suggests:
Description: The package has a series of functions that calculate the
Distortion Factor (DF), the Bayesian Factor associated with the
functions and plot the fit of the dataset to a Benford
Distribution.
License: GPL version 2 or newer
URL: http://www.r-project.org, http://www.bdo.com.au
Packaged: Wed Jan 23 16:46:00 2008; 1179
Built: R 2.6.1; i386-pc-mingw32; 2008-01-23 16:46:01; windows


# Legacy internal marker naming the package this code belongs to.
.packageName <- "BenfordsLaw"
# Average mean
"AM" <-
function(x){
  # Actual (observed) mean of the data after collapsing each value
  # onto [10, 100) with x.coll().
  collapsed <- x.coll(x)
  sum(collapsed) / length(x)
}
"BF.Bayesian" <-
function(x, alpha = 5) {
  # Bayesian test of the collapsed-digit mean against the Benford
  # expectation.  Returns a list with Bayes factors b1, b2 (= 1/b1)
  # and the posterior probability term alpha.
  digit.values <- x.coll2(x)
  prior.h0 <- alpha / 100
  n <- length(digit.values)
  sample.mean <- mean(digit.values)
  # For a large sample the sample SD approximates the population SD.
  sample.sd <- sd(digit.values)
  # Expected mean under Benford's Law (same quantity as EM()).
  expected.mean <- 90 / (n * (10^(1 / n) - 1))
  z <- sqrt(n) * (sample.mean - expected.mean) / sample.sd
  shrink <- exp((z^2 / 2) / (1 + 1 / n))
  b1 <- sqrt(1 + n) / shrink
  b2 <- 1 / b1
  alpha.0 <- 1 + ((1 - prior.h0) / prior.h0) * shrink / sqrt(1 + n)
  list(b1 = b1, b2 = b2, alpha = alpha.0)
}
# Z-statistics for the Distortion Factor model
"BF.Z.statistic" <-
function(x) {
  # Standardise the distortion factor by its expected standard deviation.
  DF(x) / SD.DF(x)
}
# The main function: plots first- and first-two-digit frequencies against
# Benford's Law and reports the Distortion Factor Z test plus Bayesian factors.
#
# x     : numeric vector of positive values to analyse
# alpha : significance level in percent (default 5, i.e. 0.05)
"Benford.s.Law" <-
function(x, alpha = 5) {
  # Two stacked plots: first digit, then first two digits.
  par(mfrow = c(2, 1))
  BenfordObsExp1(x, alpha)
  BenfordObsExp12(x, alpha)
  bf.z <- BF.Z.statistic(x)
  # BUG FIX: the original used dnorm(), the normal *density*, as a p-value.
  # The two-sided tail probability 2 * P(Z > |z|) is the correct p-value.
  p.value <- 2 * pnorm(-abs(bf.z))
  # If p.value < alpha, possible fraud / risk is indicated in the dataset.
  # BUG FIX: the original branches were inverted (it reported non-conformance
  # when p > alpha) and used vectorised ifelse() on a scalar condition.
  if (p.value < (alpha / 100)) {
    print("The dataset shows signs of non-conformance. The result is significant")
  } else {
    print("The dataset conforms to Benford's Law and the expected values are within the confidence range as set.")
  }
  b.value <- BF.Bayesian(x, alpha)
  print("")
  print("The Z Statistic is: ")
  print(bf.z)
  print("")
  print("The P.Value is: ")
  print(p.value)
  # Label typo fixed ("Bayseian" -> "Bayesian").
  print("The Bayesian Value is: ")
  print(b.value)
}
################# Plotting first digit frequency #############

"BenfordObsExp1" <-
function(x, alpha = 5){
  # Plot observed first-digit proportions (black bars) against the Benford
  # expectation (blue) with alpha-level confidence bounds (green).
  # Modified from the Plot function of Keith Wright.
  data <- substitute(x)
  n <- length(x)
  # Peel off the leading digit via scientific notation, e.g. 123 -> "1.23e+02".
  # NOTE(review): assumes strictly positive input; a leading "-" would give NA.
  x <- as.numeric(substring(formatC(x, format = 'e'), 1, 1))
  observed <- tabulate(x, nbins = 9) / n
  expected <- dbenford(1:9)
  upper <- benford.bound.upper(x, 1:9, alpha)
  lower <- benford.bound.lower(x, 1:9, alpha)
  plot(1:9, observed, xlim = c(1, 9),
       ylim = c(0, max(observed, expected, 0.350)), type = 'h',
       main = paste("First Digit Benford's Law Plot for Alpha =", alpha),
       sub = paste("(Confidence Intervals are Green, Expected is blue, Observed are Black)"),
       xlab = "First Digits", ylab = " Proportions")
  axis(1, at = 1:9)
  points(1:9, expected, col = "black", pch = 23)
  lines(spline(1:9, upper), col = "green", pch = 16)
  lines(spline(1:9, expected), col = "blue", pch = 16)
  lines(spline(1:9, lower), col = "green", pch = 16)
}
########### Plotting first and second digits frequency #########

"BenfordObsExp12" <-
function(x, alpha = 5){
  # Plot observed first-two-digit (10-99) proportions against the Benford
  # expectation with alpha-level confidence bounds.
  data <- substitute(x)
  n <- length(x)
  # Peel off the first and second digits: x.coll2() collapses each value
  # to an integer in 10..99; shift down by 9 so tabulate() bins 1..90.
  # (The original also computed an unused intermediate `x.tab`; removed.)
  x.collapsed2 <- x.coll2(x)
  obsFreq <- tabulate(x.collapsed2 - 9, nbins = 90)
  benFreq <- dbenford(10:99)
  benFreq.U <- benford.bound.upper(x, 10:99, alpha)
  benFreq.L <- benford.bound.lower(x, 10:99, alpha)
  plot(10:99, obsFreq/n, xlim = c(10, 99),
       ylim = c(0, max(obsFreq/n, benFreq, 0.06)), type = 'h',
       main = paste("Two Digit Benford's Law Plot for Alpha =", alpha),
       sub = paste("(Confidence Intervals are Green, Expected is blue, Observed are Black)"),
       xlab = "First Two Digits", ylab = "Proportions")
  # NOTE(review): only a single tick at 10 is drawn; at = seq(10, 90, by = 10)
  # was probably intended -- kept as-is to preserve the original output.
  axis(1, at = 10)
  lines(spline(10:99, benFreq.U), col = "green", pch = 16)
  lines(spline(10:99, benFreq), col = "blue", pch = 16)
  lines(spline(10:99, benFreq.L), col = "green", pch = 16)
}
# Legacy internal marker naming the package this code belongs to (duplicate of the earlier assignment).
.packageName <- "BenfordsLaw"
########################################################################################
############################# Calculation of the distortion factor #####################
# See page 66 of Mark J. Nigrini (2000) Benford's Law, 2nd Ed., Global Audit Publications, Vancouver
"x.coll" <-
function(x){
  # Collapse each value onto [10, 100) by stripping its order of magnitude,
  # e.g. 123 -> 12.3, 4.5 -> 45.
  # NOTE(review): requires x > 0; log10() of zero or negatives fails.
  magnitude <- 10^as.integer(log10(x))
  (10 * x) / magnitude
}
# This added another as.integer into the Xcoll equation, but this is not described in the text
"x.coll2" <-
function(x){
  # First two significant digits of each value, as an integer in 10..99.
  collapsed <- (10 * x) / (10^as.integer(log10(x)))
  as.integer(collapsed)
}

# Average mean
"AM" <-
function(x){
  # Observed mean of the data collapsed to [10, 100); x.coll() preserves
  # length, so dividing by length(x) is the true mean.
  n.obs <- length(x)
  sum(x.coll(x)) / n.obs
}


# Expected mean
"EM" <-
function(x){
  # Expected mean of the collapsed data under a Benford distribution
  # (Nigrini 2000, p. 66).  Depends only on the sample size n.
  # (The original also computed x.coll(x) and never used it; removed.)
  n <- length(x)
  90 / (n * (10^(1 / n) - 1))
}

# Distortion factor
"DF" <-
function(x){
  # Distortion Factor: relative deviation of the actual mean from the
  # Benford-expected mean (Nigrini 2000, p. 66).
  # (The original computed AM(x) and EM(x), ignored the cached values and
  # called both functions again, and left an unused `n`; fixed.)
  AM.value <- AM(x)
  EM.value <- EM(x)
  (AM.value - EM.value) / EM.value
}

# Standard deviation of the distortion factor test
"SD.DF" <-
function(x){
  # Approximate standard deviation of the DF statistic expected when the
  # data follow Benford's Law; 0.63825342 is Nigrini's tabulated constant.
  0.63825342 / sqrt(length(x))
}

# Z-statistics for the Distortion Factor model
"BF.Z.statistic" <-
function(x) {
  # The distortion factor scaled by its Benford-expected standard deviation.
  z.stat <- DF(x) / SD.DF(x)
  z.stat
}
# This gives you the p-value from the Distortion Factor model
"BF.p.value" <- function( x )
{
  # Two-sided p-value for the Distortion Factor Z statistic.
  # BUG FIX: the original returned dnorm(z) -- the normal *density* at z,
  # not a probability.  The correct two-sided tail probability is
  # 2 * P(Z > |z|).
  bf.z <- BF.Z.statistic(x)
  2 * pnorm(-abs(bf.z))
}
########################################################################################

"BF.Bayesian" <-
function(x, alpha = 5) {
  # Bayesian assessment of the collapsed-digit mean against the Benford
  # expectation; returns a list of Bayes factors b1, b2 = 1/b1 and the
  # posterior probability term alpha.
  x <- x.coll2(x)
  pi0 <- alpha / 100
  n <- length(x)
  # For a large sample the sample SD approximates the population SD.
  S <- sd(x)
  # Expected mean under Benford's Law (same formula as EM()).
  theta.o <- 90 / (n * (10^(1/n) - 1))
  z <- sqrt(n) * (mean(x) - theta.o) / S
  kernel <- exp((z^2 / 2) / (1 + 1/n))
  result <- list()
  result$b1 <- sqrt(1 + n) / kernel
  result$b2 <- 1 / result$b1
  result$alpha <- 1 + ((1 - pi0) / pi0) * kernel / sqrt(1 + n)
  result
}

#######################################################################################

# The main function: plots first- and first-two-digit frequencies against
# Benford's Law and reports the Distortion Factor Z test plus Bayesian factors.
#
# x     : numeric vector of positive values to analyse
# alpha : significance level in percent (default 5, i.e. 0.05)
"Benford.s.Law" <-
function(x, alpha = 5) {
  # Two stacked plots: first digit, then first two digits.
  par(mfrow = c(2, 1))
  BenfordObsExp1(x, alpha)
  BenfordObsExp12(x, alpha)
  bf.z <- BF.Z.statistic(x)
  # BUG FIX: the original used dnorm(), the normal *density*, as a p-value.
  # The two-sided tail probability 2 * P(Z > |z|) is the correct p-value.
  p.value <- 2 * pnorm(-abs(bf.z))
  # If p.value < alpha, possible fraud / risk is indicated in the dataset.
  # BUG FIX: the original branches were inverted (it reported non-conformance
  # when p > alpha) and used vectorised ifelse() on a scalar condition.
  if (p.value < (alpha / 100)) {
    print("The dataset shows signs of non-conformance. The result is significant")
  } else {
    print("The dataset conforms to Benford's Law and the expected values are within the confidence range as set.")
  }
  b.value <- BF.Bayesian(x, alpha)
  print("")
  print("The Z Statistic is: ")
  print(bf.z)
  print("")
  print("The P.Value is: ")
  print(p.value)
  # Label typo fixed ("Bayseian" -> "Bayesian").
  print("The Bayesian Value is: ")
  print(b.value)
}

################# Graphing function ################
# Helper functions for plot
"benford.bound.lower" <-
function(x,y,alpha=5){
  # Lower confidence bound for the Benford proportion of each digit
  # sequence in y, at significance alpha (percent), with a 1/(2n)
  # continuity correction.  x is the data vector; only its length is used.
  # (The original computed an unused local `p`; removed.)
  n <- length(x)
  # Normal quantile z_{1 - alpha/200}, e.g. 1.96 for alpha = 5.
  q <- qnorm((1-(alpha/200)), mean=0, sd=1, log.p = FALSE)
  dbenford(y) - q*sqrt(dbenford(y)*(1- dbenford(y))/n) - (1/(2*n))
}
"benford.bound.upper" <-
function(x,y,alpha=5){
  # Upper confidence bound for the Benford proportion of each digit
  # sequence in y, at significance alpha (percent), with a 1/(2n)
  # continuity correction.  x is the data vector; only its length is used.
  # (The original computed an unused local `p`; removed.)
  n <- length(x)
  # Normal quantile z_{1 - alpha/200}, e.g. 1.96 for alpha = 5.
  q <- qnorm((1-(alpha/200)), mean=0, sd=1, log.p = FALSE)
  dbenford(y) + q*sqrt(dbenford(y)*(1- dbenford(y))/n) + (1/(2*n))
}
################# Plotting first digit frequency #############

"BenfordObsExp1" <-
function(x, alpha = 5){
  # Plot of observed vs. expected first-digit proportions with
  # alpha-level confidence bounds (green), expectation in blue.
  # Modified from the Plot function of Keith Wright.
  data <- substitute(x)
  n <- length(x)
  digits <- 1:9
  # Peel off the first digit using scientific notation ("1.23e+02" -> "1").
  # NOTE(review): assumes strictly positive input; a leading "-" would give NA.
  x <- as.numeric(substring(formatC(x, format = 'e'), 1, 1))
  obsFreq <- tabulate(x, nbins = 9)
  benFreq <- dbenford(digits)
  upper.bound <- benford.bound.upper(x, digits, alpha)
  lower.bound <- benford.bound.lower(x, digits, alpha)
  plot(digits, obsFreq / n, xlim = c(1, 9),
       ylim = c(0, max(obsFreq / n, benFreq, 0.350)), type = 'h',
       main = paste("First Digit Benford's Law Plot for Alpha =", alpha),
       sub = paste("(Confidence Intervals are Green, Expected is blue, Observed are Black)"),
       xlab = "First Digits", ylab = " Proportions")
  axis(1, at = digits)
  points(digits, benFreq, col = "black", pch = 23)
  lines(spline(digits, upper.bound), col = "green", pch = 16)
  lines(spline(digits, benFreq), col = "blue", pch = 16)
  lines(spline(digits, lower.bound), col = "green", pch = 16)
}

########### Plotting first and second digits frequency #########

"BenfordObsExp12" <-
function(x, alpha = 5){
  # Plot observed first-two-digit (10-99) proportions against the Benford
  # expectation with alpha-level confidence bounds.
  data <- substitute(x)
  n <- length(x)
  # Peel off the first and second digits: x.coll2() collapses each value
  # to an integer in 10..99; shift down by 9 so tabulate() bins 1..90.
  # (The original also computed an unused intermediate `x.tab`; removed.)
  x.collapsed2 <- x.coll2(x)
  obsFreq <- tabulate(x.collapsed2 - 9, nbins = 90)
  benFreq <- dbenford(10:99)
  benFreq.U <- benford.bound.upper(x, 10:99, alpha)
  benFreq.L <- benford.bound.lower(x, 10:99, alpha)
  plot(10:99, obsFreq/n, xlim = c(10, 99),
       ylim = c(0, max(obsFreq/n, benFreq, 0.06)), type = 'h',
       main = paste("Two Digit Benford's Law Plot for Alpha =", alpha),
       sub = paste("(Confidence Intervals are Green, Expected is blue, Observed are Black)"),
       xlab = "First Two Digits", ylab = "Proportions")
  # NOTE(review): only a single tick at 10 is drawn; at = seq(10, 90, by = 10)
  # was probably intended -- kept as-is to preserve the original output.
  axis(1, at = 10)
  lines(spline(10:99, benFreq.U), col = "green", pch = 16)
  lines(spline(10:99, benFreq), col = "blue", pch = 16)
  lines(spline(10:99, benFreq.L), col = "green", pch = 16)
}

##################################################################
# Emulates the d/p/q/r function family (cf. dnorm, pnorm, qnorm, rnorm)
# Helper functions for plot
# Expected proportion of the digit x, gives the density
"dbenford" <-
function(x){
  # Benford probability of leading digit sequence x.
  log10(1 + 1/x)
}

# Cumulative probability: gives the Benford distribution function.
# Input: q      vector of quantiles (digit sequences, e.g. 1:9 or 10:99)
#        digits the fixed number of digits in the analysis
"pbenford" <-
function(q, digits = 1){
  d <- digits
  digits_range <- (10^(d-1)):(10^d -1)
  cumprobs <- cumsum(dbenford(digits_range))
  # BUG FIX: the original indexed cumprobs[q] directly, which is only valid
  # for digits = 1 (q runs 1..9).  For digits >= 2 the first quantile is
  # 10^(d-1), so q must be mapped onto positions 1..length(digits_range).
  # For digits = 1 this offset is zero, preserving the old behaviour.
  cumprobs[q - 10^(d - 1) + 1]
}
# Quantile function
# Input: p      vector of probabilities
#        digits the fixed number of digits in the analysis
"qbenford" <-
function(p, digits = 1){
  d <- digits
  support <- (10^(d-1)):(10^d -1)
  cumprobs <- cumsum(dbenford(support))
  # Force the final cumulative probability to exactly 1 (rounding guard).
  cumprobs[length(support)] <- 1
  # Smallest digit sequence whose cumulative probability reaches each p.
  sapply(p, function(prob) {10^d - sum(cumprobs >= prob)})
}
# Benford's Law: generate random deviates.
# Input: n      number of observations; if length(n) > 1, its length is used
#        digits the fixed number of digits in the analysis
"rbenford" <-
function(n, digits = 1){
  if (length(n) > 1) {
    n <- length(n)
  }
  support <- (10^(digits - 1)):(10^digits - 1)
  sample(support, size = n, replace = TRUE, prob = dbenford(support))
}





# Distortion factor
"DF" <-
function(x){
  # Distortion Factor: relative deviation of the actual mean from the
  # Benford-expected mean (Nigrini 2000, p. 66).
  # (The original computed AM(x) and EM(x), ignored the cached values and
  # called both functions again, and left an unused `n`; fixed.)
  AM.value <- AM(x)
  EM.value <- EM(x)
  (AM.value - EM.value) / EM.value
}

# Expected mean
"EM" <-
function(x){
  # Expected mean of the collapsed data under a Benford distribution
  # (Nigrini 2000, p. 66).  Depends only on the sample size n.
  # (The original also computed x.coll(x) and never used it; removed.)
  n <- length(x)
  90 / (n * (10^(1 / n) - 1))
}
# Standard deviation of the distortion factor test
"SD.DF" <-
function(x){
  # Expected standard deviation of the DF statistic when the data follow
  # Benford's Law; 0.63825342 is Nigrini's tabulated constant.
  n <- length(x)
  0.63825342 / n^0.5
}
# Helper function for plot
"benford.bound.lower" <-
function(x,y,alpha=5){
  # Lower confidence bound for the Benford proportion of each digit
  # sequence in y, at significance alpha (percent), with a 1/(2n)
  # continuity correction.  x is the data vector; only its length is used.
  # (The original computed an unused local `p`; removed.)
  n <- length(x)
  # Normal quantile z_{1 - alpha/200}, e.g. 1.96 for alpha = 5.
  q <- qnorm((1-(alpha/200)), mean=0, sd=1, log.p = FALSE)
  dbenford(y) - q*sqrt(dbenford(y)*(1- dbenford(y))/n) - (1/(2*n))
}
# Helper functions for plot
"benford.bound.upper" <-
function(x,y,alpha=5){
  # Upper confidence bound for the Benford proportion of each digit
  # sequence in y, at significance alpha (percent), with a 1/(2n)
  # continuity correction.  x is the data vector; only its length is used.
  # (The original computed an unused local `p`; removed.)
  n <- length(x)
  # Normal quantile z_{1 - alpha/200}, e.g. 1.96 for alpha = 5.
  q <- qnorm((1-(alpha/200)), mean=0, sd=1, log.p = FALSE)
  dbenford(y) + q*sqrt(dbenford(y)*(1- dbenford(y))/n) + (1/(2*n))
}
# Helper function for plot.
# Expected proportion of the leading digit sequence x (the Benford density).
"dbenford" <-
function(x){
  # P(leading digits = x) = log10(1 + 1/x)
  log10(1 + 1/x)
}
# Cumulative probability: gives the Benford distribution function.
# Input: q      vector of quantiles (digit sequences, e.g. 1:9 or 10:99)
#        digits the fixed number of digits in the analysis
"pbenford" <-
function(q, digits = 1){
  d <- digits
  digits_range <- (10^(d-1)):(10^d -1)
  cumprobs <- cumsum(dbenford(digits_range))
  # BUG FIX: the original indexed cumprobs[q] directly, which is only valid
  # for digits = 1 (q runs 1..9).  For digits >= 2 the first quantile is
  # 10^(d-1), so q must be mapped onto positions 1..length(digits_range).
  # For digits = 1 this offset is zero, preserving the old behaviour.
  cumprobs[q - 10^(d - 1) + 1]
}
# Quantile function
# Input: p      vector of probabilities
#        digits the fixed number of digits in the analysis
"qbenford" <-
function(p, digits = 1){
  d <- digits
  support <- (10^(d-1)):(10^d -1)
  cumprobs <- cumsum(dbenford(support))
  # Force the final cumulative probability to exactly 1 (rounding guard).
  cumprobs[length(support)] <- 1
  # Smallest digit sequence whose cumulative probability reaches each p.
  sapply(p, function(prob) {10^d - sum(cumprobs >= prob)})
}
# Benford's Law: generate random deviates.
# Input: n      number of observations; if length(n) > 1, its length is used
#        digits the fixed number of digits in the analysis
"rbenford" <-
function(n, digits = 1){
  if (length(n) > 1) {
    n <- length(n)
  }
  support <- (10^(digits - 1)):(10^digits - 1)
  # Draw digit sequences with probabilities given by the Benford density.
  sample(support, size = n, replace = TRUE, prob = dbenford(support))
}
# See page 66 of Mark J. Nigrini (2000) Benford's Law, 2nd Ed., Global Audit Publications, Vancouver
"x.coll" <-
function(x){
  # Collapse each value onto [10, 100) by dividing out its order of
  # magnitude, e.g. 123 -> 12.3, 4.5 -> 45.
  # NOTE(review): requires x > 0; log10() of zero or negatives fails.
  (10 * x) / (10^as.integer(log10(x)))
}
# This added another as.integer into the Xcoll equation, but this is not described in the text
"x.coll2" <-
function(x){
  # Integer version of x.coll(): the first two significant digits (10..99).
  as.integer((10 * x) / (10^as.integer(log10(x))))
}

2 comments:

Anonymous said...

Hi Craig,

Please pardon the blunt question but does this package have an IT Security application or is it purely for statistical geeks?

I've looked up Benford's law here http://mathworld.wolfram.com/BenfordsLaw.html and it seems to be about the distribution of the first digit in data sets?

So would you use this to profile the first octet of source ip addresses in your firewall logs and then look for variations to detect when you are being "hit" from an unusual source. Not much choice for first digits in IP addresses (0,1,2) so would it work for hex? Aren't IP addresses dimensionless? Think I need a bigger box to think inside :-)

Please put me out my misery.

Thanks

Dai

Craig S Wright said...

I have another post on this topic at http://gse-compliance.blogspot.com/2007/11/implementing-fraud-detection-using.html

Natural distributions are significantly different to those that people select.

In accounting this is used to find instances of fraud.

The same can also be applied to log files and other information in databases. This is particularly the case in the event that files are created randomly, for instance.

The distribution of creation times can provide an indication of systematic alteration. This methodology can also be used in traffic flow analysis.

I will add more in a separate post later.