People Analytics

An exploratory analysis for a HR dataset

Read Data

data = read_csv('https://raw.githubusercontent.com/ariannalangwang/Analyze-HR-Data/master/HRDataProject/HRDataset_v13.csv') %>% 
   clean_names()
data
## # A tibble: 310 × 35
##    employee_name      emp_id married_id marital_status_… gender_id emp_status_id
##    <chr>               <dbl>      <dbl>            <dbl>     <dbl>         <dbl>
##  1 Brown, Mia         1.10e9          1                1         0             1
##  2 LaRotonda, William 1.11e9          0                2         1             1
##  3 Steans, Tyrone     1.30e9          0                0         1             1
##  4 Howard, Estelle    1.21e9          1                1         0             1
##  5 Singh, Nan         1.31e9          0                0         0             1
##  6 Smith, Leigh Ann   7.11e8          1                1         0             5
##  7 Bunbury, Jessica   1.50e9          1                1         0             5
##  8 Carter, Michelle   1.40e9          0                0         0             1
##  9 Dietrich, Jenna    1.41e9          0                0         0             1
## 10 Digitale, Alfred   1.31e9          1                1         1             1
## # … with 300 more rows, and 29 more variables: dept_id <dbl>,
## #   perf_score_id <dbl>, from_diversity_job_fair_id <dbl>, pay_rate <dbl>,
## #   termd <dbl>, position_id <dbl>, position <chr>, state <chr>, zip <chr>,
## #   dob <chr>, sex <chr>, marital_desc <chr>, citizen_desc <chr>,
## #   hispanic_latino <chr>, race_desc <chr>, dateof_hire <chr>,
## #   dateof_termination <chr>, term_reason <chr>, employment_status <chr>,
## #   department <chr>, manager_name <chr>, manager_id <dbl>, …

Explore Data

skimr::skim(data) #summary
Table 1: Data summary
Name data
Number of rows 310
Number of columns 35
_______________________
Column type frequency:
character 19
numeric 16
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
employee_name 0 1.00 8 23 0 310 0
position 0 1.00 3 28 0 31 0
state 0 1.00 2 2 0 28 0
zip 0 1.00 5 5 0 158 0
dob 0 1.00 8 8 0 306 0
sex 0 1.00 1 1 0 2 0
marital_desc 0 1.00 6 9 0 5 0
citizen_desc 0 1.00 10 19 0 3 0
hispanic_latino 0 1.00 2 3 0 4 0
race_desc 0 1.00 5 32 0 6 0
dateof_hire 0 1.00 8 10 0 99 0
dateof_termination 207 0.33 8 8 0 93 0
term_reason 1 1.00 5 32 0 17 0
employment_status 0 1.00 6 22 0 5 0
department 0 1.00 5 20 0 6 0
manager_name 0 1.00 8 18 0 21 0
recruitment_source 0 1.00 5 38 0 23 0
performance_score 0 1.00 3 17 0 4 0
last_performance_review_date 103 0.67 8 9 0 42 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
emp_id 0 1.00 1.199745e+09 1.8296e+08 602000312.00 1.101024e+09 1.203032e+09 1.378814e+09 1988299991 ▁▇▇▁▁
married_id 0 1.00 4.000000e-01 4.9000e-01 0.00 0.000000e+00 0.000000e+00 1.000000e+00 1 ▇▁▁▁▅
marital_status_id 0 1.00 8.100000e-01 9.4000e-01 0.00 0.000000e+00 1.000000e+00 1.000000e+00 4 ▇▇▂▁▁
gender_id 0 1.00 4.300000e-01 5.0000e-01 0.00 0.000000e+00 0.000000e+00 1.000000e+00 1 ▇▁▁▁▆
emp_status_id 0 1.00 2.400000e+00 1.8000e+00 1.00 1.000000e+00 1.000000e+00 5.000000e+00 5 ▇▁▁▁▃
dept_id 0 1.00 4.610000e+00 1.0800e+00 1.00 5.000000e+00 5.000000e+00 5.000000e+00 6 ▁▂▁▇▁
perf_score_id 0 1.00 2.980000e+00 5.8000e-01 1.00 3.000000e+00 3.000000e+00 3.000000e+00 4 ▁▁▁▇▁
from_diversity_job_fair_id 0 1.00 9.000000e-02 2.9000e-01 0.00 0.000000e+00 0.000000e+00 0.000000e+00 1 ▇▁▁▁▁
pay_rate 0 1.00 3.128000e+01 1.5380e+01 14.00 2.000000e+01 2.400000e+01 4.531000e+01 80 ▇▂▂▂▁
termd 0 1.00 3.300000e-01 4.7000e-01 0.00 0.000000e+00 0.000000e+00 1.000000e+00 1 ▇▁▁▁▃
position_id 0 1.00 1.684000e+01 6.2300e+00 1.00 1.800000e+01 1.900000e+01 2.000000e+01 30 ▂▁▁▇▁
manager_id 8 0.97 1.458000e+01 8.0900e+00 1.00 1.000000e+01 1.500000e+01 1.900000e+01 39 ▅▇▇▁▁
engagement_survey 0 1.00 3.330000e+00 1.2900e+00 1.03 2.080000e+00 3.470000e+00 4.520000e+00 5 ▅▅▃▅▇
emp_satisfaction 0 1.00 3.890000e+00 9.1000e-01 1.00 3.000000e+00 4.000000e+00 5.000000e+00 5 ▁▁▇▇▇
special_projects_count 0 1.00 1.210000e+00 2.3500e+00 0.00 0.000000e+00 0.000000e+00 0.000000e+00 8 ▇▁▁▂▁
days_late_last30 103 0.67 0.000000e+00 0.0000e+00 0.00 0.000000e+00 0.000000e+00 0.000000e+00 0 ▁▁▇▁▁
DataExplorer::plot_bar(data)

Pivot Table

rpivotTable::rpivotTable(data,
    height = 1500)

Check for duplicates

(nrow(get_dupes(data)) == 0) #duplicates?
## [1] TRUE

Check reduntant information

data %>% distinct(hispanic_latino)
## # A tibble: 4 × 1
##   hispanic_latino
##   <chr>          
## 1 No             
## 2 Yes            
## 3 yes            
## 4 no
data <- data %>% mutate(
    hispanic_latino = as_factor(hispanic_latino),
    hispanic_latino = recode(data$hispanic_latino,
    yes = 'Yes', no = 'No'))

Missing data

#missing data?
data %>% miss_var_summary()
## # A tibble: 35 × 3
##    variable                     n_miss pct_miss
##    <chr>                         <int>    <dbl>
##  1 dateof_termination              207   66.8  
##  2 last_performance_review_date    103   33.2  
##  3 days_late_last30                103   33.2  
##  4 manager_id                        8    2.58 
##  5 term_reason                       1    0.323
##  6 employee_name                     0    0    
##  7 emp_id                            0    0    
##  8 married_id                        0    0    
##  9 marital_status_id                 0    0    
## 10 gender_id                         0    0    
## # … with 25 more rows
data %>% gg_miss_upset()

# plot missing data (using raw data)
DataExplorer::plot_missing(
  title   = "% of Missing Data",
  data    = data,
  ggtheme = tidyquant::theme_tq(), 
  missing_only = TRUE) 

#Term Reason column
data %>% select(dateof_termination, term_reason) %>% count(term_reason,sort=T) 
## # A tibble: 18 × 2
##    term_reason                          n
##    <chr>                            <int>
##  1 N/A - still employed               196
##  2 Another position                    20
##  3 unhappy                             14
##  4 more money                          11
##  5 N/A - Has not started yet           11
##  6 career change                        9
##  7 hours                                9
##  8 attendance                           7
##  9 relocation out of area               5
## 10 return to school                     5
## 11 military                             4
## 12 performance                          4
## 13 retiring                             4
## 14 maternity leave - did not return     3
## 15 medical issues                       3
## 16 no-call, no-show                     3
## 17 gross misconduct                     1
## 18 <NA>                                 1
#it seems there is a missing value here, we drop the row
data <- data %>% drop_na(term_reason)

#Manager ID
#Count how many employees has each manager
data %>% select(manager_name, manager_id) %>% count(manager_name,sort=T) 
## # A tibble: 21 × 2
##    manager_name       n
##    <chr>          <int>
##  1 Elijiah Gray      22
##  2 Kelley Spirea     22
##  3 Kissy Sullivan    22
##  4 Michael Albert    22
##  5 Amy Dunn          21
##  6 Brannon Miller    21
##  7 David Stanley     21
##  8 Ketsia Liebig     21
##  9 Webster Butler    21
## 10 Janet King        19
## # … with 11 more rows
#Which manager has not id values?
data %>% select(manager_name, manager_id) %>% filter(is.na(manager_id))
## # A tibble: 8 × 2
##   manager_name   manager_id
##   <chr>               <dbl>
## 1 Webster Butler         NA
## 2 Webster Butler         NA
## 3 Webster Butler         NA
## 4 Webster Butler         NA
## 5 Webster Butler         NA
## 6 Webster Butler         NA
## 7 Webster Butler         NA
## 8 Webster Butler         NA
# Webster Butler has not id, let'see if it applies for other rows.
data %>% select(manager_name, manager_id) %>% filter(manager_name == 'Webster Butler')
## # A tibble: 21 × 2
##    manager_name   manager_id
##    <chr>               <dbl>
##  1 Webster Butler         39
##  2 Webster Butler         NA
##  3 Webster Butler         NA
##  4 Webster Butler         39
##  5 Webster Butler         39
##  6 Webster Butler         39
##  7 Webster Butler         39
##  8 Webster Butler         39
##  9 Webster Butler         39
## 10 Webster Butler         39
## # … with 11 more rows
# As expected just few rows did not reported his id, so we impute with his reported id of 39
#fill with 39 where is.na == TRUE
data$manager_id[is.na(data$manager_id)] <- 39

data %>% select(dateof_termination,
                                last_performance_review_date,
                                days_late_last30)
## # A tibble: 309 × 3
##    dateof_termination last_performance_review_date days_late_last30
##    <chr>              <chr>                                   <dbl>
##  1 <NA>               1/15/2019                                   0
##  2 <NA>               1/17/2019                                   0
##  3 <NA>               1/18/2019                                   0
##  4 <NA>               1/15/2019                                   0
##  5 09/25/13           <NA>                                       NA
##  6 08/02/14           <NA>                                       NA
##  7 <NA>               1/21/2019                                   0
##  8 <NA>               1/29/2019                                   0
##  9 <NA>               1/30/2019                                   0
## 10 <NA>               1/17/2019                                   0
## # … with 299 more rows
unique(data$days_late_last30)
## [1]  0 NA
data %>% select(-days_late_last30)
## # A tibble: 309 × 34
##    employee_name      emp_id married_id marital_status_… gender_id emp_status_id
##    <chr>               <dbl>      <dbl>            <dbl>     <dbl>         <dbl>
##  1 Brown, Mia         1.10e9          1                1         0             1
##  2 LaRotonda, William 1.11e9          0                2         1             1
##  3 Steans, Tyrone     1.30e9          0                0         1             1
##  4 Singh, Nan         1.31e9          0                0         0             1
##  5 Smith, Leigh Ann   7.11e8          1                1         0             5
##  6 Bunbury, Jessica   1.50e9          1                1         0             5
##  7 Carter, Michelle   1.40e9          0                0         0             1
##  8 Dietrich, Jenna    1.41e9          0                0         0             1
##  9 Digitale, Alfred   1.31e9          1                1         1             1
## 10 Friedman, Gerry    1.20e9          0                0         1             1
## # … with 299 more rows, and 28 more variables: dept_id <dbl>,
## #   perf_score_id <dbl>, from_diversity_job_fair_id <dbl>, pay_rate <dbl>,
## #   termd <dbl>, position_id <dbl>, position <chr>, state <chr>, zip <chr>,
## #   dob <chr>, sex <chr>, marital_desc <chr>, citizen_desc <chr>,
## #   hispanic_latino <chr>, race_desc <chr>, dateof_hire <chr>,
## #   dateof_termination <chr>, term_reason <chr>, employment_status <chr>,
## #   department <chr>, manager_name <chr>, manager_id <dbl>, …

It seems that no employees came late in the last 30 days as the column days_late_last30 presents only 0 values and NA for the rows wherein the employee has already left the company. So I will drop the column.

unique(data[,c("dateof_termination","term_reason")])
## # A tibble: 103 × 2
##    dateof_termination term_reason              
##    <chr>              <chr>                    
##  1 <NA>               N/A - still employed     
##  2 09/25/13           career change            
##  3 08/02/14           Another position         
##  4 09/05/15           attendance               
##  5 10/31/14           relocation out of area   
##  6 <NA>               N/A - Has not started yet
##  7 09/12/15           performance              
##  8 03/15/15           no-call, no-show         
##  9 02/22/15           no-call, no-show         
## 10 05/01/16           performance              
## # … with 93 more rows
employees <- data %>% filter(
        is.na(dateof_termination)) %>% 
    select(-dateof_termination)

term_employees <- data %>% filter(
        is.na(dateof_termination) == F)

Data Visualization

Gender Gap

employees %>% select(sex, pay_rate, 
                    dateof_hire, position) %>% mutate(
        position = as.factor(position),
        position = fct_lump(position, 5),
        dateof_hire=anytime::anydate(dateof_hire),
        sex = as_factor(sex),
        year = year(dateof_hire)) %>% 
group_by(year,sex, position) %>% summarise(
    avg = mean(pay_rate)) %>% 
    ggplot(aes(x=year, y=avg, color =sex)) + geom_line() + facet_wrap(~position) + ggthemes::theme_gdocs() + theme(
        axis.text.x = element_text(angle = 35)
    )

Barplot

data %>%
  filter(term_reason=="N/A - still employed") %>%
  group_by(sex,position) %>%
  summarise(Total=n()) %>% 
  hchart(type="column",
        hcaes(x=position,y=Total,group=sex)) %>%
      hc_add_theme(hc_theme_economist())
data %>% group_by(position) %>% summarise(pay_rate = round(mean(pay_rate),2)) %>% 
    hchart("column", hcaes(x = position, y = pay_rate)) %>%
    hc_title(text = "Company Pay Rates") %>%
      hc_subtitle(text = "Average pay rate by job position") %>%
      hc_add_theme(hc_theme_economist())

Boxplot

highchart() %>%
  hc_xAxis(type ="category")%>%
  hc_add_series_list(data_to_boxplot(data,
                                pay_rate, race_desc,
                                group_var = sex,
                                add_outliers = F)) %>% 
  hc_legend(enabled= F) %>%
      hc_add_theme(hc_theme_economist())

Pie Charts

race_pie <- data %>% count(race_desc) %>% 
    hchart('pie', hcaes(race_desc,n)) %>% hc_add_theme(hc_theme_economist()) 

gender_pie <- data %>% count(sex) %>% 
    hchart('pie', hcaes(sex,n)) %>% 
    hc_add_theme(hc_theme_economist())

department_pie <- data %>% count(department) %>% 
    hchart('pie', hcaes(department, n)) %>% 
    hc_add_theme(hc_theme_economist())

managers_pie <- data %>% count(manager_name) %>% 
    hchart('pie', hcaes(manager_name, n)) %>% 
    hc_add_theme(hc_theme_economist())

hw_grid(race_pie, gender_pie, 
                department_pie, managers_pie,
                ncol = 2, rowheight = 350)

Waffle Plot

pacman::p_load(waffle)

View(employees %>% select(marital_desc,performance_score) %>% 
            group_by(marital_desc,performance_score) %>% 
    summarise(n= n()) %>% mutate(freq = round(n/sum(n),2)))
  
waffle(c('Exceeds = 15%' = 15,
                 'Fully Meets = 78%' =78,
                 'Needs Improvement = 4%' = 4,
                 'PIP = 3%' = 3), 
             rows = 8, size = 0.2,
             title = 'Performance Score For Single Employees') + 
    theme(title = element_text(
        family = 'Arial',
        size = 1, face = 'bold'
    ))

Ridgeline Plots

library(ggridges)
ggplot(data, aes(
    y=as_factor(race_desc),
    x=pay_rate,
    fill=sex
)) + geom_density_ridges(alpha=0.7) + theme_minimal()

Treemap

data$manager_name <- as_factor(data$manager_name)

data %>% mutate(
    manager_name = fct_lump(manager_name,15)) %>%
    group_by(manager_name) %>% count() %>% 
treemap::treemap(
    index = 'manager_name',
    vSize = 'n',
    title = "Treemap: Managers names by number of employees")

Chicklet chart

term_employees %>% head() %>% 
    miss_var_summary() 
## # A tibble: 35 × 3
##    variable                     n_miss pct_miss
##    <chr>                         <int>    <dbl>
##  1 last_performance_review_date      6      100
##  2 days_late_last30                  6      100
##  3 employee_name                     0        0
##  4 emp_id                            0        0
##  5 married_id                        0        0
##  6 marital_status_id                 0        0
##  7 gender_id                         0        0
##  8 emp_status_id                     0        0
##  9 dept_id                           0        0
## 10 perf_score_id                     0        0
## # … with 25 more rows
term_employees %<>% select(-last_performance_review_date,
                                                     -days_late_last30)

term_employees %>% select(dateof_hire, dateof_termination)
## # A tibble: 102 × 2
##    dateof_hire dateof_termination
##    <chr>       <chr>             
##  1 9/26/2011   09/25/13          
##  2 8/15/2011   08/02/14          
##  3 7/7/2014    09/05/15          
##  4 3/7/2011    10/31/14          
##  5 7/7/2014    09/12/15          
##  6 2/16/2015   03/15/15          
##  7 2/16/2015   02/22/15          
##  8 12/1/2014   05/01/16          
##  9 1/5/2015    10/31/15          
## 10 1/9/2012    11/04/15          
## # … with 92 more rows
library(lubridate)
term_employees %<>% mutate(
    dateof_hire=anytime::anydate(dateof_hire),
    dateof_termination=as.Date(dateof_termination,
                                                         format("%m/%d/%y")),
    worktime = difftime(dateof_termination, dateof_hire)
)


term_employees %>% select(position, worktime) %>% 
    mutate(position=as.factor(term_employees$position),
                 worktime = as.integer(worktime)) %>% 
    group_by(position) %>% summarise(
        avg = round(mean(worktime),0)) %>% 
    arrange(desc(avg)) %>% 
    ggplot(aes(x=reorder(position,avg), y=avg)) +
    ggchicklet::geom_chicklet(aes(fill=avg),
                width = 0.8, radius = grid::unit(5, "pt")) + coord_flip() + scale_fill_gradient(low = '#4B3F72', 
                                             high = '#7C9EB2') +    
    coord_flip() + 
    scale_y_discrete(expand = c(0, 0)) +
    labs(title = "Number of days before quitting job",
             subtitle = "By position",
             caption = "Data: Kaggle",
             fill = "Days") +
    theme(
        aspect.ratio=4/3,
        plot.title = element_text(face = "bold",
                                                            size = 13,
                                                            family = "Times",
                                                            colour = "black"),
        plot.subtitle = element_text(size = 11.7,
                                                                 family = "Times",
                                                                 colour = "grey45"),
        plot.caption = element_text(size = 10,
                                                                family = "Times",
                                                                colour = "grey45"),
        axis.text.y = element_text(vjust = 0.5,
                                                             family  = "Times"),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.background = element_blank(),
        panel.grid = element_blank(),
        panel.border = element_blank(),
        legend.justification= c(1,0),
        legend.key.size = unit(15, "pt"),
        legend.text = element_text(size = 10,
                                                             family = "Times"),
        legend.position = c(1,0))

Time series of people quitting

term_employees %>% select(dateof_hire,
    dateof_termination, worktime, 
    employee_name,term_reason) %>% 
    arrange(worktime) %>% filter( 
    worktime < quantile(worktime, 0.05) |
    worktime > quantile(worktime, 0.95)) %>% 
    mutate(term_reason = str_to_title(term_reason))
## # A tibble: 12 × 5
##    dateof_hire dateof_termination worktime  employee_name      term_reason      
##    <date>      <date>             <drtn>    <chr>              <chr>            
##  1 2012-09-24  2012-09-26            2 days MacLennan, Samuel  Hours            
##  2 2011-01-10  2011-01-12            2 days Baczenski, Rachael Another Position 
##  3 2015-02-16  2015-02-22            6 days Hernandez, Daniff  No-Call, No-Show 
##  4 2014-02-17  2014-02-25            8 days Evensen, April     No-Call, No-Show 
##  5 2011-11-07  2011-11-15            8 days Gerke, Melisa      Hours            
##  6 2011-05-16  2011-06-04           19 days Power, Morissa     Another Position 
##  7 2011-01-10  2016-01-26         1842 days Robinson, Alain    Attendance       
##  8 2011-01-10  2016-04-01         1908 days Jung, Judy         Unhappy          
##  9 2011-01-10  2016-05-17         1954 days Robinson, Cherly   Attendance       
## 10 2009-10-26  2015-04-08         1990 days Sloan, Constance   Maternity Leave …
## 11 2010-10-25  2016-05-18         2032 days Peterson, Ebonee   Another Position 
## 12 2008-09-02  2015-09-29         2583 days Ybarra, Catherine  Another Position

Network Visualization

library(igraph)
library(networkD3)
employees %>% select(manager_name, employee_name) %>% head(30) %>% simpleNetwork(
    fontSize = 13, charge = -15,
    linkDistance = 65, zoom = T,
    nodeColour = 'red',
)
comments powered by Disqus