Chapter 20 Masking HR data
In the toolbox of every HR analytics expert, the ability to quickly anonymise data for test and development environments has an important place: you want to be able to protect confidential data from inappropriate views.
Ensure all needed libraries are installed, then load them:
library(tidyverse)
library(randNames)
library(smoothmest)
20.0.1 White House dataset
Let us load a dataset with first and last names and preview it.
whitehouse <- read_csv("https://hranalytics.netlify.com/data/2016-Report-White-House-Staff.csv")
Let us replace the original names with fake names.
# Create a set of new names the same length as the original dataset
fake <- nrow(whitehouse) %>%
  rand_names(nat = "US")
# Function to capitalise the first letter
firstup <- function(x) {
  substr(x, 1, 1) <- toupper(substr(x, 1, 1))
  x
}
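A quick check of the helper:

firstup("smith")
[1] "Smith"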
# Create a dataframe without the original names column
whitehouse_with_no_names <- dplyr::select(whitehouse, -Name)
# Create a column of fake last and first names
fake$newnames <- paste0(firstup(fake$name.last), ", ", firstup(fake$name.first))
# Keep exactly as many fake names as there are rows in the dataframe without the original names
result <- fake[1:nrow(whitehouse_with_no_names), ]
# Bind the dataset without names to the new column of fake names
whitehouse_masked <- cbind(result$newnames, whitehouse_with_no_names)
#Rename the column title to "Name"
colnames(whitehouse_masked)[1] <- "Name"
Alternatively, let us replace the original names with random numbers.
# Set seed for reproducibility of the random numbers
set.seed(42)
# Replace names with random numbers from 1 to 1000
# (sampled without replacement, so 1:1000 must have at least as many values as there are rows)
whitehouse_no_names <- whitehouse_masked %>%
  mutate(Name = sample(1:1000, nrow(whitehouse_masked)))
Let us anonymise the salary data
We can use four different methods to anonymise data:

* Rounding: here, rounding Salary to the nearest ten thousand.
* Top coding: bringing data up to an upper limit; here, recoding salaries into capped categories.
* Bottom coding: bringing data below a lower limit up to that limit.
* Transformation: various transformations exist in data analysis (reciprocal, logarithm, cube root, square root and square); in this chapter we will demonstrate only the square root transformation, further below.
# Rounding Salary to the nearest ten thousand
whitehouse_no_identifiers <- whitehouse_no_names %>%
  mutate(Salary = round(Salary, digits = -4))
# Top coding: convert the salaries into three categories
whitehouse.gen <- whitehouse_masked %>%
  mutate(Salary = ifelse(Salary < 50000, 0,
                         ifelse(Salary >= 50000 & Salary < 100000, 1, 2)))
# Bottom Coding
whitehouse.bottom <- whitehouse_masked %>%
  mutate(Salary = ifelse(Salary <= 45000, 45000, Salary))
20.0.2 Fertility dataset
Let us explore a deeper anonymisation option: generating synthetic data. We will do so on the basis of the fertility dataset available from the UCI website: https://archive.ics.uci.edu/ml/datasets/Fertility

Let us load it first and preview it.
fertility <- read_csv("https://hranalytics.netlify.com/data/fertility_Diagnosis.txt", col_names = FALSE)
# Assign meaningful column titles
colnames(fertility) <- c("Season", "Age", "Child_Disease", "Accident_Trauma", "Surgical_Intervention","High_Fevers", "Alcohol_Consumption","Smoking_Habit","Hours_Sitting","Diagnosis")
# View fertility data
fertility
# A tibble: 100 x 10
Season Age Child_Disease Accident_Trauma Surgical_Interv~ High_Fevers
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 -0.33 0.69 0 1 1 0
2 -0.33 0.94 1 0 1 0
3 -0.33 0.5 1 0 0 0
4 -0.33 0.75 0 1 1 0
5 -0.33 0.67 1 1 0 0
6 -0.33 0.67 1 0 1 0
7 -0.33 0.67 0 0 0 -1
8 -0.33 1 1 1 1 0
9 1 0.64 0 0 1 0
10 1 0.61 1 0 0 0
# ... with 90 more rows, and 4 more variables: Alcohol_Consumption <dbl>,
# Smoking_Habit <dbl>, Hours_Sitting <dbl>, Diagnosis <chr>
Let us examine the dataset.
More on the data attributes is available here: https://archive.ics.uci.edu/ml/datasets/Fertility
# Number of participants with Surgical_Intervention, by Diagnosis
fertility %>%
  group_by(Diagnosis) %>%
  summarise_at(vars(Surgical_Intervention), sum)
# A tibble: 2 x 2
Diagnosis Surgical_Intervention
<chr> <dbl>
1 N 44
2 O 7
# Mean and standard deviation of Age
fertility %>%
  summarise_at(vars(Age), funs(mean, sd))
# A tibble: 1 x 2
mean sd
<dbl> <dbl>
1 0.669 0.121319
# Counts of the Groups in High_Fevers
fertility %>%
  count(High_Fevers)
# A tibble: 3 x 2
High_Fevers n
<dbl> <int>
1 -1 9
2 0 63
3 1 28
# Counts of the Groups in Child_Disease and Accident_Trauma
fertility %>%
  count(Child_Disease, Accident_Trauma)
# A tibble: 4 x 3
Child_Disease Accident_Trauma n
<dbl> <dbl> <int>
1 0 0 10
2 0 1 3
3 1 0 46
4 1 1 41
# Calculate the average of Child_Disease
fertility %>%
  summarise_at(vars(Child_Disease), mean)
# A tibble: 1 x 1
Child_Disease
<dbl>
1 0.87
In the following we will generate synthetic data by sampling from a normal distribution, through these steps:

1. Create a new dataset called "fert" after applying a log transformation to the hours-sitting variable.
2. Calculate the mean and the standard deviation.
3. Set a seed for reproducibility.
4. Generate new, normally distributed data for the hours-sitting variable.
5. Transform the logged variable back using the exponential.
6. Hard-bound any data falling outside the valid range.
7. Recheck the range.
8. Substitute the synthetic data back into the fert dataset.
fert <- fertility %>%
  mutate(Hours_Sitting = log(Hours_Sitting))
fert %>%
  summarise_at(vars(Hours_Sitting), funs(mean, sd))
# A tibble: 1 x 2
mean sd
<dbl> <dbl>
1 -1.01224 0.504779
set.seed(42)
hours.sit <- rnorm(100, -1.01, 0.50)

hours.sit <- exp(hours.sit)

hours.sit[hours.sit < 0] <- 0
hours.sit[hours.sit > 1] <- 1
range(hours.sit)
[1] 0.0815 1.0000
fert$Hours_Sitting <- hours.sit
Returning to the White House salary data, let us now demonstrate the square root transformation.
# Square root Transformation of Salary
whitehouse.salary <- whitehouse_masked %>%
  mutate(Salary = sqrt(Salary))
# Calculate the mean and standard deviation
stats <- whitehouse.salary %>%
  summarise_at(vars(Salary), funs(mean, sd))
# Generate Synthetic data with the same mean and standard deviation
salary_transformed <- rnorm(nrow(whitehouse_masked),
                            mean(whitehouse.salary$Salary),
                            sd(whitehouse.salary$Salary))
# Power transformation back to the original scale
salary_original <- salary_transformed^2
# Hard bound
salary <- ifelse(salary_original < 0, 0, salary_original)
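As a quick sanity check (a minimal sketch using the objects created above), we can compare the masked original salaries with the synthetic ones; the two summaries should be broadly similar:

# Compare original and synthetic salary distributions
summary(whitehouse_masked$Salary)
summary(salary)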
Let us now introduce the concept of differential privacy, a mathematical framework used by big names like Google, Apple and the US Census Bureau. Why differential privacy? It quantifies the privacy loss via a privacy budget, called epsilon, and it assumes the worst-case scenario about the data intruder. A smaller privacy budget means less information, that is, a noisier answer; epsilon, however, cannot be zero or lower.
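To see what the budget does, here is a minimal sketch (using rdoublex() from smoothmest, introduced below, and an arbitrary true count of 50) comparing a generous budget with a tight one:

# Laplace noise scale = sensitivity / epsilon: smaller epsilon, more noise
set.seed(1)
rdoublex(5, 50, 1 / 1)    # epsilon = 1: answers close to 50
rdoublex(5, 50, 1 / 0.01) # epsilon = 0.01: answers drowned in noise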
Global sensitivity of common queries, where n is the total number of observations, a is the lower bound of the data and b is the upper bound of the data:

* Count: 1
* Proportion: 1 / n
* Mean: (b - a) / n
* Variance: (b - a)^2 / n
A small global sensitivity results in less noise; a large global sensitivity results in more noise.
# Number of observations
n <- nrow(fertility)

# Global sensitivity of counts
gs.count <- 1

# Global sensitivity of proportions
gs.prop <- 1 / n

# Lower bound
a <- 0

# Upper bound
b <- 1

# Global sensitivity of the mean
gs.mean <- (b - a) / n

# Global sensitivity of the variance
gs.var <- (b - a)^2 / n
fertility %>%
  summarise_at(vars(Child_Disease), sum)
# A tibble: 1 x 1
Child_Disease
<dbl>
1 87
library(smoothmest)
# rdoublex(n, mu, lambda)
# rdoublex is a random number generator: it creates a vector of n random numbers
# drawn from the double exponential distribution with location mu and scale lambda.
The double exponential distribution is also commonly referred to as the Laplace distribution.
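To visualise its shape, here is a minimal sketch in base R; the dlaplace() helper is ours, written inline since base R has no built-in Laplace density:

# Laplace (double exponential) density with location mu and scale lambda
dlaplace <- function(x, mu = 0, lambda = 1) exp(-abs(x - mu) / lambda) / (2 * lambda)
curve(dlaplace(x), from = -5, to = 5, ylab = "density",
      main = "Double exponential (Laplace) density")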
set.seed(42)
rdoublex(1, 87, 1 / 10)
[1] 87
set.seed(42)
rdoublex(1, 87, 1 / 0.1)
[1] 85
Sequential Composition
Suppose a set of privacy mechanisms M is performed sequentially on a dataset, each mechanism providing its own epsilon privacy guarantee.

Sequential composition determines the privacy guarantee for a sequence of differentially private computations: when a set of randomised mechanisms has been performed sequentially on a dataset, the final privacy guarantee is the sum of the individual privacy budgets.

Since we are about to ask two queries (a mean and a variance), the total privacy budget of 0.1 must be divided by two, giving each query an epsilon of 0.05.
# Set value of epsilon
eps <- 0.1 / 2
# GS of mean and variance
gs.mean <- 0.01
gs.var <- 0.01
# Apply the Laplace mechanism
set.seed(42)
rdoublex(1, 0.41, gs.mean / eps)
[1] 0.37
rdoublex(1, 0.19, gs.var / eps)
[1] 0.247
Parallel Composition
Suppose a set of privacy mechanisms M is performed on disjoint subsets of a dataset, each mechanism providing its own epsilon privacy guarantee.

Parallel composition determines the privacy guarantee in this situation: because every record is touched by only one mechanism, the final privacy guarantee is the maximum, not the sum, of the individual privacy budgets.

The privacy budget therefore does not need to be divided; the query with the largest epsilon is the budget for the data.
#High_Fevers and Mean of Hours_Sitting
fertility %>%
  filter(High_Fevers >= 0) %>%
  summarise_at(vars(Hours_Sitting), mean)
# A tibble: 1 x 1
Hours_Sitting
<dbl>
1 0.393297
# No High_Fevers and Mean of Hours_Sitting
fertility %>%
  filter(High_Fevers == -1) %>%
  summarise_at(vars(Hours_Sitting), mean)
# A tibble: 1 x 1
Hours_Sitting
<dbl>
1 0.543333
# Set value of epsilon
eps <- 0.1

# GS of mean for Hours_Sitting
gs.mean <- 1 / 100
# Apply the Laplace mechanism
set.seed(42)
rdoublex(1, 0.39, gs.mean / eps)
[1] 0.37
rdoublex(1, 0.54, gs.mean / eps)
[1] 0.568
Prepping the data
# Set value of epsilon
eps <- 0.01

# GS of counts
gs.count <- 1
fertility %>%
  count(Smoking_Habit)
# A tibble: 3 x 2
Smoking_Habit n
<dbl> <int>
1 -1 56
2 0 23
3 1 21
#Apply the Laplace mechanism
set.seed(42)
# Two counts are queried directly, so each uses half the budget (eps / 2)
smoking1 <- rdoublex(1, 56, gs.count / eps / 2) %>%
  round()
smoking2 <- rdoublex(1, 23, gs.count / eps / 2) %>%
  round()

# Post-process the third count based on the previous queries
smoking3 <- nrow(fertility) - smoking1 - smoking2
# Checking the noisy answers
smoking1
[1] 46
smoking2
[1] 37
smoking3
[1] 17
Impossible and Inconsistent Answers
# Set value of epsilon
eps <- 0.01

# GS of counts
gs.count <- 1
# Count participants with an abnormal diagnosis
Number_abnormal <- fertility %>%
  filter(Diagnosis == "O") %>%
  summarise(sum_x1 = sum(Diagnosis == "O"))
#Negative Counts: Applying the Laplace mechanism
# Apply the Laplace mechanism and set.seed(22)
set.seed(22)
rdoublex(1, 12, gs.count / eps) %>%
round()
[1] -79
# Apply the Laplace mechanism and set.seed(22)
set.seed(22)
rdoublex(1, 12, gs.count / eps) %>%
round() %>%
max(0)
[1] 0
# Suppose we set a different seed
set.seed(12)
noisy_answer <- rdoublex(1, 12, gs.count / eps) %>%
  round() %>%
  max(0)

n <- nrow(fertility)
# Cap impossible answers: a count cannot exceed the number of observations
ifelse(noisy_answer > n, n, noisy_answer)
[1] 100
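A compact equivalent that enforces both bounds (non-negative and at most n) in a single step:

# Clamp the noisy count to the valid range [0, n]
pmin(pmax(noisy_answer, 0), n)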
#Normalising
# Set value of epsilon
eps <- 0.01

# GS of counts
gs.count <- 1

fertility %>%
  count(Smoking_Habit)
# A tibble: 3 x 2
Smoking_Habit n
<dbl> <int>
1 -1 56
2 0 23
3 1 21
# Apply the Laplace mechanism and set.seed(42)
set.seed(42)
smoking1 <- rdoublex(1, 56, gs.count / eps / 2) %>%
  max(0)
smoking2 <- rdoublex(1, 23, gs.count / eps / 2) %>%
  max(0)
smoking3 <- rdoublex(1, 21, gs.count / eps / 2) %>%
  max(0)
# Checking the noisy answers
smoking <- c(smoking1, smoking2, smoking3)
smoking
[1] 46.1 37.2 44.7
# Normalize smoking
normalized <- (smoking / sum(smoking)) * nrow(fertility)
# Round the values
round(normalized)
[1] 36 29 35
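Finally, we can check that the normalised, rounded counts still total the number of respondents (in general, rounding can shift the sum by one or two):

# The rounded counts should sum to (roughly) nrow(fertility)
sum(round(normalized))
[1] 100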