Code templates

install and load multiple packages

# Close every open graphics device so later plots start from a clean state.
while (dev.cur() > 1) {
  dev.off()
}

# Packages used by the templates in this file.
packages <- c(
  "corrplot", "tidyverse", "ggpubr",
  "Hmisc", "parameters", "performance",
  "psych", "see", "sjlabelled", "sjmisc", "sjPlot"
)

# Install each package only when it is missing, then attach it.
# requireNamespace() performs the "is it available?" check; the R help page
# for installed.packages() advises against using that function for this.
for (pkg in packages) {
  if (requireNamespace(pkg, quietly = TRUE)) {
    message(paste("Package already installed:", pkg))
  } else {
    message(paste("Installing package:", pkg))
    install.packages(pkg, dependencies = TRUE)
  }
  # library() stops with an error if the package still cannot be loaded.
  library(pkg, character.only = TRUE)
}

install and load a single package

# Install the package only when it is not available yet, then attach it.
# requireNamespace() checks availability without attaching anything;
# library() afterwards fails loudly if the installation did not succeed.
if (!requireNamespace("PackageNameHere", quietly = TRUE)) {
  install.packages("PackageNameHere", dependencies = TRUE)
}
library("PackageNameHere")

load GSS

required package(s): "sjlabelled"

# Download the GSS archive to a temporary file, extract the 2022 Stata file
# and read it into `gss`.
temp <- tempfile()
download.file("https://drive.google.com/uc?export=download&id=1mF7gMY4aU9amTgYLSVOyVQaHT_opDUbj", temp, mode = "wb")
# The member path inside the archive is re-created below `exdir`, which is
# why the file is read from "OrigData/OrigData/..." afterwards.
unzip(temp, files = "OrigData/2022/GSS2022.dta", exdir = "OrigData")
# The downloaded archive is not needed once the file has been extracted.
unlink(temp)
gss <- haven::read_dta("OrigData/OrigData/2022/GSS2022.dta")
# Data frame of the variable labels returned by get_label() (sjlabelled).
key <- as.data.frame(get_label(gss))

frequency table (for categorical variables)

required package(s): "sjmisc"

# Frequency table for one variable of `gss`; replace `variable_here`.
frq(gss$variable_here, out = "v")

descriptive table (for continuous variables)

required package(s): "sjmisc"

# Descriptive statistics for one variable of `gss`; replace `variable_here`.
descr(gss$variable_here, out = "v", show = "short")

recoding

required package(s): "sjmisc"

(1) merging values (categorical to categorical)

1.2. recoding (merging values with 2 values)

# Template: merge original codes into two groups (1,2 -> 1 and 3,4 -> 2).
# Replace the variable names, the value lists and the bracketed labels.
gss$new_variable_here <- rec(gss$original_variable_here, rec = 
"1,2=1 [label1]; 
3,4=2 [label2]", append = FALSE)

1.3. recoding (merging values with 3 values)

# Template: merge original codes into three groups.
# Each original value appears in exactly one rule and each rule has its own
# target value (1, 2, 3). Replace the variable names, value lists and labels.
gss$new_variable_here <- rec(gss$original_variable_here, rec =
"1,2,3=1 [label1];
4,5=2 [label2];
6,7,8=3 [label3]", append = FALSE)

1.4. recoding (merging values with 4 values)

# Template: merge original codes into four groups (target values 1 to 4).
# Replace the variable names, the value lists and the bracketed labels.
gss$new_variable_here <- rec(gss$original_variable_here, rec =
"1,2=1 [label1];
3,4=2 [label2];
5,6=3 [label3];
7,8=4 [label4]", append = FALSE)

1.5. recoding (merging values with 5 values)

# Template: merge original codes into five groups (target values 1 to 5),
# each with its own label placeholder.
# Replace the variable names, the value lists and the bracketed labels.
gss$new_variable_here <- rec(gss$original_variable_here, rec =
"1,2=1 [label1];
3,4,5=2 [label2];
6=3 [label3];
7,8=4 [label4];
9,10=5 [label5]", append = FALSE)

1.6. recoding (merging values with 6 values)

# Template: merge original codes 1-15 into six groups (target values 1 to 6).
# Replace the variable names, the value lists and the bracketed labels.
gss$new_variable_here <- rec(gss$original_variable_here, rec = 
"1,2=1 [label1]; 
3,4,5=2 [label2];
6,7,8=3 [label3];
9,10=4 [label4];
11,12=5 [label5];
13,14,15=6 [label6]", append = FALSE)

1.7. recoding (merging values with 7 values)

# Template: merge original codes 1-18 into seven groups (target values 1 to 7).
# Replace the variable names, the value lists and the bracketed labels.
gss$new_variable_here <- rec(gss$original_variable_here, rec = 
"1,2=1 [label1]; 
3,4,5=2 [label2];
6,7,8=3 [label3];
9,10=4 [label4];
11,12=5 [label5];
13,14,15=6 [label6];
16,17,18=7 [label7]", append = FALSE)

(2) reversing values (categorical to categorical)

2.2. recoding (reversing values with 2 values)

# Template: reverse a 2-point coding (1 -> 2, 2 -> 1).
# Replace the variable names and the bracketed labels.
gss$new_variable_here <- rec(gss$original_variable_here, rec = 
"1=2 [label1]; 
2=1 [label2]", append = FALSE)

2.3. recoding (reversing values with 3 values)

# Template: reverse a 3-point coding (1 -> 3, 2 -> 2, 3 -> 1).
# Replace the variable names and the bracketed labels.
gss$new_variable_here <- rec(gss$original_variable_here, rec = 
"1=3 [label1];
2=2 [label2];
3=1 [label3]", append = FALSE)

2.4. recoding (reversing values with 4 values)

# Template: reverse a 4-point coding (1 -> 4, 2 -> 3, 3 -> 2, 4 -> 1).
# Replace the variable names and the bracketed labels.
gss$new_variable_here <- rec(gss$original_variable_here, rec = 
"1=4 [label1];
2=3 [label2];
3=2 [label3];
4=1 [label4]", append = FALSE)

2.5. recoding (reversing values with 5 values)

# Template: reverse a 5-point coding (1 -> 5 ... 5 -> 1; 3 stays 3).
# Replace the variable names and the bracketed labels.
gss$new_variable_here <- rec(gss$original_variable_here, rec = 
"1=5 [label1]; 
2=4 [label2];
3=3 [label3];
4=2 [label4];
5=1 [label5]", append = FALSE)

2.6. recoding (reversing values with 6 values)

# Template: reverse a 6-point coding (1 -> 6 ... 6 -> 1).
# Replace the variable names and the bracketed labels.
gss$new_variable_here <- rec(gss$original_variable_here, rec = 
"1=6 [label1]; 
2=5 [label2];
3=4 [label3];
4=3 [label4];
5=2 [label5];
6=1 [label6]", append = FALSE)

2.7. recoding (reversing values with 7 values)

# Template: reverse a 7-point coding (1 -> 7 ... 7 -> 1; 4 stays 4).
# Replace the variable names and the bracketed labels.
gss$new_variable_here <- rec(gss$original_variable_here, rec = 
"1=7 [label1]; 
2=6 [label2];
3=5 [label3];
4=4 [label4];
5=3 [label5];
6=2 [label6];
7=1 [label7]", append = FALSE)

(3) transforming continuous variables into groups (continuous to categorical)

3.2. recoding (transforming continuous variables into groups with 2 values)

# Template: cut a continuous variable into two groups (0-40 -> 1, 41-100 -> 2).
# Replace the variable names, the ranges and the bracketed labels.
gss$new_variable_here <- rec(gss$original_variable_here, rec = 
"0:40=1 [label1]; 
41:100=2 [label2]", append = FALSE)

3.3. recoding (transforming continuous variables into groups with 3 values)

# Template: cut a continuous variable into three groups
# (1-10 -> 1, 11-20 -> 2, 21-100 -> 3).
# NOTE(review): the first range starts at 1, so a value of 0 is not covered
# by any rule here — confirm how rec() treats uncovered values and adjust
# the ranges to the variable being recoded.
gss$new_variable_here <- rec(gss$original_variable_here, rec = 
"1:10=1 [label1];
11:20=2 [label2]; 
21:100=3 [label3]", append = FALSE)

3.4. recoding (transforming continuous variables into groups with 4 values)

# Template: cut a continuous variable into four groups
# (0-10 -> 1, 11-20 -> 2, 21-40 -> 3, 41-100 -> 4).
# Replace the variable names, the ranges and the bracketed labels.
gss$new_variable_here <- rec(gss$original_variable_here, rec = 
"0:10=1 [label1];
11:20=2 [label2]; 
21:40=3 [label3]; 
41:100=4 [label4]", append = FALSE)

3.5. recoding (transforming continuous variables into groups with 5 values)

# Template: cut a continuous variable into five groups
# (1-10, 11-20, 21-30, 31-40, 41-100 -> 1 to 5).
# NOTE(review): the first range starts at 1, so a value of 0 is not covered
# by any rule here — confirm how rec() treats uncovered values and adjust
# the ranges to the variable being recoded.
gss$new_variable_here <- rec(gss$original_variable_here, rec = 
"1:10=1 [label1];
11:20=2 [label2]; 
21:30=3 [label3]; 
31:40=4 [label4]; 
41:100=5 [label5]", append = FALSE)

3.6. recoding (transforming continuous variables into groups with 6 values)

# Template: cut a continuous variable into six groups
# (1-10, 11-20, 21-30, 31-40, 41-50, 51-100 -> 1 to 6).
# NOTE(review): the first range starts at 1, so a value of 0 is not covered
# by any rule here — confirm how rec() treats uncovered values and adjust
# the ranges to the variable being recoded.
gss$new_variable_here <- rec(gss$original_variable_here, rec = 
"1:10=1 [label1];
11:20=2 [label2]; 
21:30=3 [label3]; 
31:40=4 [label4]; 
41:50=5 [label5]; 
51:100=6 [label6]", append = FALSE)

3.7. recoding (transforming continuous variables into groups with 7 values)

# Template: cut a continuous variable into seven groups
# (1-10, 11-20, 21-30, 31-40, 41-50, 51-60, 61-100 -> 1 to 7).
# NOTE(review): the first range starts at 1, so a value of 0 is not covered
# by any rule here — confirm how rec() treats uncovered values and adjust
# the ranges to the variable being recoded.
gss$new_variable_here <- rec(gss$original_variable_here, rec = 
"1:10=1 [label1];
11:20=2 [label2]; 
21:30=3 [label3]; 
31:40=4 [label4]; 
41:50=5 [label5]; 
51:60=6 [label6]; 
61:100=7 [label7]", append = FALSE)

3.8. recoding (transforming continuous variables into groups with 8 values)

# Template: cut a continuous variable into eight groups
# (1-10, 11-20, 21-30, 31-40, 41-50, 51-60, 61-70, 71-100 -> 1 to 8).
# NOTE(review): the first range starts at 1, so a value of 0 is not covered
# by any rule here — confirm how rec() treats uncovered values and adjust
# the ranges to the variable being recoded.
gss$new_variable_here <- rec(gss$original_variable_here, rec = 
"1:10=1 [label1];
11:20=2 [label2]; 
21:30=3 [label3]; 
31:40=4 [label4]; 
41:50=5 [label5]; 
51:60=6 [label6]; 
61:70=7 [label7]; 
71:100=8 [label8]", append = FALSE)

computing

required package(s): "tidyverse"

computing 1

# Row-wise mean of the listed variables, stored as a new column.
# mean() is called without na.rm, so a row with a missing item gets NA.
# ungroup() removes the rowwise grouping again; without it `gss` stays a
# rowwise data frame and later dplyr verbs are evaluated one row at a time.
gss <- gss %>%
  rowwise() %>%
  mutate(new_variable_here = mean(c(variable_1_here, variable_2_here, variable_3_here))) %>%
  ungroup()

computing 2 (with recoding) - sample 1

# Sample 1: reverse-code three 3-point items, then average them per row.

# Inspect the original coding of `happy` before recoding it.
frq(gss$happy, out = "v")

gss$happynew <- rec(gss$happy, rec =
"1=3 [very happy];
2=2 [pretty happy];
3=1 [not too happy]", append = FALSE)

# Inspect the original coding of `life` before recoding it.
frq(gss$life, out = "v")

gss$lifenew <- rec(gss$life, rec =
"1=3 [exciting];
2=2 [routine];
3=1 [dull]", append = FALSE)

# Inspect the original coding of `satfin` before recoding it.
frq(gss$satfin, out = "v")

gss$satfinnew <- rec(gss$satfin, rec =
"1=3 [satisfied];
2=2 [more or less];
3=1 [not at all]", append = FALSE)

# Row-wise mean of the three recoded items. mean() is called without na.rm,
# so a row with any missing item gets NA. ungroup() removes the rowwise
# grouping so that later dplyr verbs on `gss` work on the whole table again.
gss <- gss %>%
  rowwise() %>%
  mutate(hapindex = mean(c(happynew, lifenew, satfinnew))) %>%
  ungroup()

computing 3 (with recoding) - sample 2

# Sample 2: reverse-code four 7-point frequency items, then average them
# per row.

# Inspect the original coding of `socrel` before recoding it.
frq(gss$socrel, out = "v")

gss$socrelnew <- rec(gss$socrel, rec =
"1=7 [almost daily];
2=6 [once or twice a week];
3=5 [several times a month];
4=4 [about once a month];
5=3 [several times a year];
6=2 [about once a year];
7=1 [never]", append = FALSE)

# Inspect the original coding of `socommun` before recoding it.
frq(gss$socommun, out = "v")

gss$socommunnew <- rec(gss$socommun, rec =
"1=7 [almost daily];
2=6 [once or twice a week];
3=5 [several times a month];
4=4 [about once a month];
5=3 [several times a year];
6=2 [about once a year];
7=1 [never]", append = FALSE)

# Inspect the original coding of `socfrend` before recoding it.
frq(gss$socfrend, out = "v")

gss$socfrendnew <- rec(gss$socfrend, rec =
"1=7 [almost daily];
2=6 [once or twice a week];
3=5 [several times a month];
4=4 [about once a month];
5=3 [several times a year];
6=2 [about once a year];
7=1 [never]", append = FALSE)

# Inspect the original coding of `socbar` before recoding it.
frq(gss$socbar, out = "v")

gss$socbarnew <- rec(gss$socbar, rec =
"1=7 [almost daily];
2=6 [once or twice a week];
3=5 [several times a month];
4=4 [about once a month];
5=3 [several times a year];
6=2 [about once a year];
7=1 [never]", append = FALSE)

# Row-wise mean of the four recoded items. mean() is called without na.rm,
# so a row with any missing item gets NA. ungroup() removes the rowwise
# grouping so that later dplyr verbs on `gss` work on the whole table again.
gss <- gss %>%
  rowwise() %>%
  mutate(sociallifeindex = mean(c(socrelnew, socommunnew, socfrendnew, socbarnew))) %>%
  ungroup()

chi square

required package(s): "sjPlot"

# Cross-tabulation of the two variables with row percentages requested
# (show.row.prc = TRUE); replace both placeholder variable names.
sjt.xtab(gss$independent_variable_here, gss$dependent_variable_here, show.row.prc = TRUE)

sampling: data creation for subsamples

required package(s): "sjPlot"

non-random (last 100 cases)

25% simple random sample

10% systematic random sample

ttest

required package(s): "tidyverse" | "parameters"

# Run t.test() on the formula (dependent ~ independent) using columns of
# `gss`, then pass the result through parameters() and display() with
# format = "html".
t.test(dependent_variable_here ~ independent_variable_here, data = gss) %>% 
  parameters() %>% 
  display(format="html")

visualization

bar graph (for categorical variables)

required package(s): "sjPlot"

# Bar chart (type = "bar") of one variable, drawn in colour #336699.
plot_frq(gss$variable_here, type = "bar", geom.colors = "#336699")

histogram (for continuous variables)

required package(s): "sjPlot"

# Histogram (type = "hist") of `educ` in colour #336699, with a normal
# curve requested (normal.curve = TRUE) in colour #9b2226.
plot_frq(gss$educ, type = "hist", 
         geom.colors = "#336699",
         normal.curve = TRUE,
         normal.curve.color = "#9b2226")

stacked bar graphs for multiple variables

required package(s): "sjPlot" | "tidyverse"

# Stacked frequency plot of five selected variables (coord.flip = TRUE).
graph <- gss %>%
  select(
    variable_1_here, variable_2_here, variable_3_here,
    variable_4_here, variable_5_here
  ) %>%
  plot_stackfrq(
    sort.frq = "first.asc",
    coord.flip = TRUE,
    geom.colors = "Blues",
    show.total = FALSE,
    title = "type graph title here"
  )

# Print the plot with adjusted font sizes.
graph +
  theme(
    axis.text.x = element_text(size = 14), # x-axis labels
    axis.text.y = element_text(size = 14), # y-axis labels
    plot.title = element_text(size = 20), # plot title
    legend.text = element_text(size = 14) # legend entries
  )

stacked bar graphs for multiple variables (axes not flipped: coord.flip = FALSE)

required package(s): "sjPlot" | "tidyverse"

# Stacked frequency plot of five selected variables (coord.flip = FALSE).
graph <- gss %>%
  select(
    variable_1_here, variable_2_here, variable_3_here,
    variable_4_here, variable_5_here
  ) %>%
  plot_stackfrq(
    sort.frq = "first.asc",
    coord.flip = FALSE,
    geom.colors = "Blues",
    show.total = FALSE,
    title = "type graph title here"
  )

# Print the plot with adjusted font sizes.
graph +
  theme(
    axis.text.x = element_text(size = 14), # x-axis labels
    axis.text.y = element_text(size = 14), # y-axis labels
    plot.title = element_text(size = 20), # plot title
    legend.text = element_text(size = 14) # legend entries
  )

stacked bar graphs by different groups

required package(s): "sjPlot"

# Cross-tab plot of two variables with the total and the case counts
# switched off (show.total = FALSE, show.n = FALSE).
plot_xtab(gss$variable_1_here, gss$variable_2_here, show.total=FALSE, show.n = FALSE)

bar graphs between groups (margin=row)

required package(s): "sjPlot"

# Same cross-tab plot as the previous template, but with margin = "row".
plot_xtab(gss$variable_1_here, gss$variable_2_here, show.total=FALSE, show.n = FALSE, margin = "row")

scatterplot with two continuous variables

required package(s): "sjPlot"

# Scatterplot of two variables of `gss` with jittered points, an lm fit
# passed via fit.grps, confidence intervals and grid = TRUE.
plot_scatter(gss, variable_1_here, variable_2_here, jitter = TRUE,
            fit.grps = lm, show.ci = TRUE, grid = TRUE,
             title = "type graph title here")

scatterplot with two continuous variables by groups

required package(s): "sjPlot"

# Scatterplot of two variables of `gss` with a categorical variable passed
# as the fourth argument; other settings match the ungrouped template.
plot_scatter(gss, variable_1_here, variable_2_here, categoricalvariable_here, jitter = TRUE,
            fit.grps = lm, show.ci = TRUE, grid = TRUE,
             title = "type graph title here")

correlation analysis

correlation analysis structure

Correlation analysis examines the linear relationship of two continuous variables.

If the p-value is statistically significant (p < 0.05), judge the strength of the correlation by the absolute value of r:

  • | r | < 0.3 … weak correlation

  • 0.3 ≤ | r | ≤ 0.5 … moderate correlation

  • | r | > 0.5 … strong correlation

The order of the variables does not matter.

(1) correlation analysis table

required package(s): "sjPlot"

# Correlation table for the two selected columns, with numeric p-values,
# the lower triangle only and pairwise deletion of missing values.
tab_corr (gss[, c("variable_1_here", "variable_2_here")],
wrap.labels = 30, p.numeric = TRUE, triangle="lower", na.deletion = "pairwise")

(2) correlation scatterplot graph

xlab: "what it measures column" of variable 1 (x)

ylab: "what it measures column" of variable 2 (y)

required package(s): "ggpubr"

# Scatterplot of two variables with a loess smoother and its confidence
# band; the individual points are switched off (point = FALSE).
scatterplot <- ggscatter(
  gss,
  x = "variable_1_here", y = "variable_2_here",
  add = "loess", conf.int = TRUE, color = "black", point = FALSE,
  xlab = "what it measures of variable_1",
  ylab = "what it measures of variable_2"
)
# Add the correlation annotation from stat_cor() and print the plot.
scatterplot + stat_cor(p.accuracy = 0.001, r.accuracy = 0.01)

(3) correlation matrix

required package(s): "sjPlot"

# Correlation table for five selected columns, with numeric p-values,
# the lower triangle only and pairwise deletion of missing values.
tab_corr (gss[, c("variable_1_here", "variable_2_here", "variable_3_here", "variable_4_here", "variable_5_here")], 
wrap.labels = 30, p.numeric = TRUE, triangle="lower", na.deletion = "pairwise")

(4) scatterplot matrix

required package(s): "psych"

# Scatterplot matrix of five selected columns. Logical switches are spelled
# out as TRUE/FALSE because T and F are ordinary variables that can be
# reassigned.
pairs.panels(
  gss[, c("variable_1_here", "variable_2_here", "variable_3_here", "variable_4_here", "variable_5_here")],
  ellipses = FALSE, scale = FALSE, show.points = FALSE, stars = TRUE, ci = TRUE
)

(5) correlogram

required package(s): "corrplot" | "Hmisc"

# Columns to include in the correlogram.
selectedvariables <- c("variable_1_here", "variable_2_here", "variable_3_here", "variable_4_here", "variable_5_here")
# Significance test results; `testRes$p` is used below as the p-value matrix.
testRes <- cor.mtest(gss[, selectedvariables])
# Correlation results from rcorr(); `$r` holds the coefficient matrix.
gssrcorr <- rcorr(as.matrix(gss[, selectedvariables]))
gsscoeff <- gssrcorr$r
# Lower-triangle pie correlogram; insignificant cells are left blank and the
# coefficients are printed in black. `$corrPos` extracts that element of
# corrplot()'s return value, so it is printed at top level.
corrplot(
  gsscoeff,
  p.mat = testRes$p, method = "pie", type = "lower", insig = "blank",
  addCoef.col = "black", order = "original", diag = FALSE
)$corrPos

linear regression

required package(s): "sjPlot"

(1) linear regression with 1 independent variable

# Linear regression with one independent variable, reported with
# standardized estimates, no confidence intervals, collapsed standard
# errors and significance stars. TRUE/FALSE are spelled out (T and F can be
# reassigned), matching the logistic-regression templates.
model1 <- lm(depvar ~ indepvar1, data = gss)
tab_model(model1, show.std = TRUE, show.ci = FALSE, collapse.se = TRUE, p.style = "stars")

(2) linear regression with 2 independent variables

# Linear regression with two independent variables, reported with
# standardized estimates, no confidence intervals, collapsed standard
# errors and significance stars. TRUE/FALSE are spelled out (T and F can be
# reassigned), matching the logistic-regression templates.
model2 <- lm(depvar ~ indepvar1 + indepvar2, data = gss)
tab_model(model2, show.std = TRUE, show.ci = FALSE, collapse.se = TRUE, p.style = "stars")

(3) linear regression with 3 independent variables

# Linear regression with three independent variables, reported with
# standardized estimates, no confidence intervals, collapsed standard
# errors and significance stars. TRUE/FALSE are spelled out (T and F can be
# reassigned), matching the logistic-regression templates.
model3 <- lm(depvar ~ indepvar1 + indepvar2 + indepvar3, data = gss)
tab_model(model3, show.std = TRUE, show.ci = FALSE, collapse.se = TRUE, p.style = "stars")

(4) linear regression with 4 independent variables

# Linear regression with four independent variables, reported with
# standardized estimates, no confidence intervals, collapsed standard
# errors and significance stars. TRUE/FALSE are spelled out (T and F can be
# reassigned), matching the logistic-regression templates.
model4 <- lm(depvar ~ indepvar1 + indepvar2 + indepvar3 + indepvar4, data = gss)
tab_model(model4, show.std = TRUE, show.ci = FALSE, collapse.se = TRUE, p.style = "stars")

add more independent variables with a plus (+)

logistic regression

(1) logistic regression with 1 independent variable

# Logistic regression (binomial family, logit link) with one independent
# variable. The table is built from `model1`, the model fitted on the line
# above.
model1 <- glm(dummydepvar ~ indepvar1, data = gss, family = binomial(link = "logit"))
tab_model(model1, show.std = TRUE, show.ci = FALSE, collapse.se = TRUE, p.style = "stars")

(2) logistic regression with 2 independent variables

# Logistic regression (binomial family, logit link) with two independent
# variables. The table is built from `model2`, the model fitted on the line
# above.
model2 <- glm(dummydepvar ~ indepvar1 + indepvar2, data = gss, family = binomial(link = "logit"))
tab_model(model2, show.std = TRUE, show.ci = FALSE, collapse.se = TRUE, p.style = "stars")

(3) logistic regression with 3 independent variables

# Logistic regression (binomial family, logit link) with three independent
# variables. The table is built from `model3`, the model fitted on the line
# above.
model3 <- glm(dummydepvar ~ indepvar1 + indepvar2 + indepvar3, data = gss, family = binomial(link = "logit"))
tab_model(model3, show.std = TRUE, show.ci = FALSE, collapse.se = TRUE, p.style = "stars")

(4) logistic regression with 4 independent variables

# Logistic regression (binomial family, logit link) with four independent
# variables; tab_model() reports the model fitted on the line above.
model4 <- glm(dummydepvar ~ indepvar1 + indepvar2 + indepvar3 + indepvar4, data = gss, family = binomial(link="logit"))
tab_model(model4, show.std = TRUE, show.ci = FALSE, collapse.se = TRUE, p.style = "stars")

add more independent variables with a plus (+)

dummy variables

DUMMY EXAMPLE

First step: Check the frequency distribution of the original variable to see what the values (1, 2, 3, etc.) mean.

Code:

# Frequency table of `happy`, used to look up what each code means.
frq(gss$happy, out = "v")

Second step: Create dummy variables for each category.

Codes:

# One 0/1 indicator per code of `happy` (1, 2, 3). Where `happy` is NA the
# comparison is NA, so ifelse() returns NA for that row.
gss$veryhappy <- ifelse(gss$happy == 1, 1, 0)
gss$prettyhappy <- ifelse(gss$happy == 2, 1, 0)
gss$nottoohappy <- ifelse(gss$happy == 3, 1, 0)

Third step: Leave one of the dummy variables out of your model. The omitted dummy variable is called the “comparison category” (also known as the reference category); the coefficients of the dummy variables that stay in the model are interpreted relative to it.

required package(s): no package needed

dummy variable: categorical (binary)

# Two 0/1 indicators built from the same source variable `var1`:
# one for code 1 and one for code 2.
gss$var1newname <- ifelse(gss$var1 == 1, 1, 0)
gss$var2newname <- ifelse(gss$var1 == 2, 1, 0)

dummy variable: nominal/ordinal 1

# Three 0/1 indicators built from the same source variable `var1`:
# one each for codes 1, 2 and 3.
gss$var1newname <- ifelse(gss$var1 == 1, 1, 0)
gss$var2newname <- ifelse(gss$var1 == 2, 1, 0)
gss$var3newname <- ifelse(gss$var1 == 3, 1, 0)

dummy variable: nominal/ordinal 1 (merging categories)

# Three 0/1 indicators from `var1`, with codes 2, 3 and 4 merged into the
# second indicator (code 1 -> first, code 5 -> third).
gss$var1newname <- ifelse(gss$var1 == 1, 1, 0)
gss$var2newname <- ifelse(gss$var1 == 2 | gss$var1 == 3 | gss$var1 == 4, 1, 0)
gss$var3newname <- ifelse(gss$var1 == 5, 1, 0)

dummy variable: nominal/ordinal 2 (merging categories)

# Five 0/1 indicators from `var1`, each covering a merged set of codes:
# 1-2, 3-4, 5-7, 8-11 and 12-16. Every indicator is stored in its own
# column so that none of them overwrites another.
gss$var1newname <- ifelse(gss$var1 == 1 | gss$var1 == 2, 1, 0)
gss$var2newname <- ifelse(gss$var1 == 3 | gss$var1 == 4, 1, 0)
gss$var3newname <- ifelse(gss$var1 == 5 | gss$var1 == 6 | gss$var1 == 7, 1, 0)
gss$var4newname <- ifelse(gss$var1 == 8 | gss$var1 == 9 | gss$var1 == 10 | gss$var1 == 11, 1, 0)
gss$var5newname <- ifelse(gss$var1 == 12 | gss$var1 == 13 | gss$var1 == 14 | gss$var1 == 15 | gss$var1 == 16, 1, 0)

dummy variable: continuous

# Three 0/1 indicators from a continuous `var1`: <= 10, 11 to 15, >= 16.
# Values strictly between 10 and 11 or between 15 and 16 match none of them.
gss$var1newname <- ifelse(gss$var1 <= 10, 1, 0)
gss$var2newname <- ifelse(gss$var1 >= 11 & gss$var1 <= 15, 1, 0)
gss$var3newname <- ifelse(gss$var1 >= 16, 1, 0)

scientific notations (e.g., 2e-16)

# Session-wide print settings: 5 significant digits, and a scipen penalty of
# 15 so that fixed notation is preferred over scientific notation.
options(digits=5, scipen=15)

mean centering

# Mean-center `age` (subtract the mean, do not rescale).
# scale() returns a one-column matrix; as.numeric() turns it into a plain
# numeric vector so the new column behaves like any other numeric column.
gss$age_cent <- as.numeric(scale(gss$age, center = TRUE, scale = FALSE))

delete the environment

# Remove every object listed by ls() from the current environment.
# This cannot be undone; unsaved objects such as `gss` are lost.
rm(list = ls())

remove categories from a variable

# Drop the rows whose `marital` code is 2 or 3.
# For a row with missing `marital` the comparisons are NA, and an NA in a
# logical row index produces a row filled with NA. The is.na() term makes
# the index TRUE for those rows so they are kept unchanged.
gss <- gss[is.na(gss$marital) | (gss$marital != 2 & gss$marital != 3), ]

remove label

# Replace `educ` with the result of remove_all_labels() on it.
gss$educ <- remove_all_labels(gss$educ)

rename variables

# Rename a column with sjmisc::var_rename(): the argument name is the
# current column name and the string is the new name, so this renames
# `agenew` to `age_groups`.
gss <- var_rename(gss, agenew = "age_groups")

change the variable from continuous to categorical

# Overwrite `var1` with its to_factor() conversion.
gss$var1 <- to_factor(gss$var1)

change the variable from categorical to continuous

# Convert `var1` to numeric via character. If `var1` is a factor, this
# converts the level text rather than the internal integer codes; text that
# is not a number becomes NA with a warning.
gss$var1 <- as.numeric(as.character(gss$var1))

show codebook

# Codebook-style overview of `gss` with frequencies, percentages and
# missing values requested; use.viewer = FALSE and max.len = 300 are
# passed as shown.
view_df(gss, show.frq = TRUE, show.prc = TRUE, use.viewer=FALSE, show.na = TRUE, max.len = 300)

remove packages

# Uninstall the named package; replace the placeholder string.
remove.packages ("package_you_want_to_remove")

assigning labels

# Attach value labels to `sex`: 1 is labelled "male", 2 is labelled "female".
gss$sex<- set_labels(gss$sex,
labels = c("male" = 1, "female" = 2))

Last updated