This analysis explores the relationship between tobacco use and cardiovascular health indicators (blood pressure and heart rate). The dataset includes measures of systolic BP, diastolic BP, and HR across smoking intensity and sex.
# Read the raw data and keep only the columns used in this analysis
heart <- read.csv("../data/dataset.csv") %>%
  select(
    randid, sex, age, sysbp, diabp, cigpday,
    heartrte, prevhyp, hyperten, timehyp
  )

# Keep participants reporting at least one cigarette per day, bin them by
# daily consumption, and turn the coded columns into labelled factors
smokers <- heart %>%
  filter(cigpday >= 1) %>%
  mutate(
    # Intensity bins: <= 5 Light, 6-20 Moderate, >= 21 Heavy (ordered levels)
    smkerst = factor(
      case_when(
        cigpday <= 5 ~ "Light",
        cigpday >= 6 & cigpday <= 20 ~ "Moderate",
        cigpday >= 21 ~ "Heavy"
      ),
      levels = c("Light", "Moderate", "Heavy")
    ),
    # Labels are matched to the sorted distinct codes found in each column
    # NOTE(review): assumes the lower code is Male / Free - confirm against
    # the dataset codebook
    sex = factor(sex, labels = c("Male", "Female")),
    prevhyp = factor(prevhyp, labels = c("Free", "Prev")),
    hyperten = factor(hyperten, labels = c("Free", "Prev"))
  )

# Quick structural check of the prepared data
glimpse(smokers)
## Rows: 780
## Columns: 11
## $ randid <int> 12806, 24721, 33077, 34689, 40435, 45464, 47561, 68194, 68397…
## $ sex <fct> Female, Female, Male, Female, Female, Female, Male, Male, Mal…
## $ age <int> 57, 51, 60, 49, 54, 64, 56, 55, 53, 54, 54, 53, 58, 49, 59, 5…
## $ sysbp <dbl> 110.0, 141.0, 144.5, 163.0, 126.0, 150.0, 132.0, 129.0, 148.0…
## $ diabp <dbl> 46.0, 81.0, 80.0, 96.0, 75.0, 89.0, 70.0, 85.0, 89.0, 111.0, …
## $ cigpday <int> 30, 15, 10, 10, 40, 20, 40, 50, 8, 3, 16, 3, 30, 35, 20, 20, …
## $ heartrte <int> 75, 85, 57, 82, 85, 90, 100, 84, 80, 63, 70, 90, 90, 70, 82, …
## $ prevhyp <fct> Free, Prev, Prev, Prev, Free, Prev, Prev, Free, Prev, Prev, F…
## $ hyperten <fct> Prev, Prev, Prev, Prev, Prev, Prev, Prev, Prev, Prev, Prev, F…
## $ timehyp <int> 8679, 4408, 0, 2157, 5933, 2177, 0, 6199, 785, 0, 8766, 8766,…
## $ smkerst <fct> Heavy, Moderate, Moderate, Moderate, Heavy, Moderate, Heavy, …
# Descriptive statistics (group size, mean, SD, SE) of systolic BP, diastolic
# BP and heart rate for each smoking-intensity group.
smokers_descrip <- smokers %>%
  group_by(smkerst) %>%
  summarise(
    # N is the number of rows in the group, including rows with missing values
    N = n(),
    Mean_SysBP = mean(sysbp, na.rm = TRUE),
    Mean_DiaBP = mean(diabp, na.rm = TRUE),
    Mean_Heartrate = mean(heartrte, na.rm = TRUE),
    SD_SysBP = sd(sysbp, na.rm = TRUE),
    SD_DiaBP = sd(diabp, na.rm = TRUE),
    SD_Heartrate = sd(heartrte, na.rm = TRUE),
    # The SDs above drop missing values, so each SE must divide by the number
    # of non-missing observations of that measure rather than by the row count
    SE_SysBP = SD_SysBP / sqrt(sum(!is.na(sysbp))),
    SE_DiaBP = SD_DiaBP / sqrt(sum(!is.na(diabp))),
    SE_Heartrate = SD_Heartrate / sqrt(sum(!is.na(heartrte)))
  )

smokers_descrip
## # A tibble: 3 × 11
## smkerst N Mean_SysBP Mean_DiaBP Mean_Heartrate SD_SysBP SD_DiaBP
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Light 93 135. 79.7 78.3 24.3 11.8
## 2 Moderate 471 133. 79.8 78.8 20.7 11.2
## 3 Heavy 216 138. 81.8 78.4 21.4 10.8
## # ℹ 4 more variables: SD_Heartrate <dbl>, SE_SysBP <dbl>, SE_DiaBP <dbl>,
## # SE_Heartrate <dbl>
# Mean and standard error of systolic and diastolic BP per smoking-intensity
# group, reshaped to one row per (group, BP type) for plotting.
bp_long <- smokers %>%
  group_by(smkerst) %>%
  summarise(
    mean_sys = mean(sysbp, na.rm = TRUE),
    mean_dia = mean(diabp, na.rm = TRUE),
    # sd() drops missing values, so the SE denominator counts only the
    # non-missing readings instead of every row in the group
    se_sys = sd(sysbp, na.rm = TRUE) / sqrt(sum(!is.na(sysbp))),
    se_dia = sd(diabp, na.rm = TRUE) / sqrt(sum(!is.na(diabp)))
  ) %>%
  pivot_longer(
    cols = starts_with("mean"),
    names_to = "Type",
    values_to = "BP"
  ) %>%
  mutate(
    # Pick the SE that belongs to this row's BP type before relabelling Type
    SE = ifelse(Type == "mean_sys", se_sys, se_dia),
    Type = recode(Type, mean_sys = "Systolic", mean_dia = "Diastolic")
  )

# Dodged bars of mean BP with +/- 1 SE error bars; the dodge width of 0.9
# lines the error bars up with the bars drawn by geom_col()
ggplot(bp_long, aes(x = smkerst, y = BP, fill = Type)) +
  geom_col(position = "dodge") +
  geom_errorbar(
    aes(ymin = BP - SE, ymax = BP + SE),
    position = position_dodge(0.9),
    width = 0.25
  ) +
  labs(
    title = "Blood Pressure by Smoking Intensity",
    x = "Smoking Status",
    y = "Mean BP (mmHg)"
  ) +
  theme_minimal()
Mean systolic and diastolic BP by smoking intensity
# Write the most recently displayed plot to disk as a 6 x 4 figure
ggsave(
  filename = "../docs/figures/bp_by_status.png",
  plot = last_plot(),
  width = 6,
  height = 4
)
# Mean heart rate and its standard error for every combination of
# smoking-intensity group and sex.
hr_summary <- smokers %>%
  group_by(smkerst, sex) %>%
  summarise(
    mean_hr = mean(heartrte, na.rm = TRUE),
    # sd() drops missing values, so divide by the non-missing count, not n()
    se_hr = sd(heartrte, na.rm = TRUE) / sqrt(sum(!is.na(heartrte))),
    # Return an ungrouped table; otherwise the result stays grouped by smkerst
    # and summarise() emits a regrouping message
    .groups = "drop"
  )

# Dodged bars of mean heart rate by sex with +/- 1 SE error bars; the dodge
# width of 0.9 lines the error bars up with the bars drawn by geom_col()
ggplot(hr_summary, aes(x = smkerst, y = mean_hr, fill = sex)) +
  geom_col(position = "dodge") +
  geom_errorbar(
    aes(ymin = mean_hr - se_hr, ymax = mean_hr + se_hr),
    position = position_dodge(0.9),
    width = 0.25
  ) +
  labs(
    title = "Heart Rate by Smoking Intensity and Sex",
    x = "Smoking Status",
    y = "Mean HR (bpm)"
  ) +
  theme_minimal()
Average heart rate by smoking intensity and sex
# Write the most recently displayed plot to disk as a 6 x 4 figure
ggsave(
  filename = "../docs/figures/hr_by_sex.png",
  plot = last_plot(),
  width = 6,
  height = 4
)
# One-way ANOVAs: does each outcome differ across smoking-intensity groups?
anova_sys <- aov(sysbp ~ smkerst, data = smokers)    # systolic BP
anova_dia <- aov(diabp ~ smkerst, data = smokers)    # diastolic BP
anova_hr <- aov(heartrte ~ smkerst, data = smokers)  # heart rate

# ANOVA table for systolic BP
summary(anova_sys)
## Df Sum Sq Mean Sq F value Pr(>F)
## smkerst 2 2808 1404.0 3.079 0.0465 *
## Residuals 777 354256 455.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA table for diastolic BP across smoking-intensity groups
summary(anova_dia)
## Df Sum Sq Mean Sq F value Pr(>F)
## smkerst 2 616 307.9 2.468 0.0854 .
## Residuals 777 96942 124.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA table for heart rate across smoking-intensity groups
summary(anova_hr)
## Df Sum Sq Mean Sq F value Pr(>F)
## smkerst 2 32 16.25 0.102 0.903
## Residuals 777 123304 158.69
# Post-hoc Scheffe tests
# Pairwise comparisons between smoking-intensity groups for systolic BP
ScheffeTest(anova_sys)
##
## Posthoc multiple comparisons of means: Scheffe Test
## 95% family-wise confidence level
##
## $smkerst
## diff lwr.ci upr.ci pval
## Moderate-Light -1.850010 -7.79210892 4.092088 0.7472
## Heavy-Light 2.489471 -4.00528891 8.984232 0.6430
## Heavy-Moderate 4.339482 0.03626435 8.642699 0.0475 *
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Scheffe pairwise comparisons between smoking-intensity groups, diastolic BP
ScheffeTest(anova_dia)
##
## Posthoc multiple comparisons of means: Scheffe Test
## 95% family-wise confidence level
##
## $smkerst
## diff lwr.ci upr.ci pval
## Moderate-Light 0.1259845 -2.9824259 3.234395 0.9951
## Heavy-Light 2.0888590 -1.3086579 5.486376 0.3214
## Heavy-Moderate 1.9628745 -0.2882099 4.213959 0.1023
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Scheffe pairwise comparisons between smoking-intensity groups, heart rate
ScheffeTest(anova_hr)
##
## Posthoc multiple comparisons of means: Scheffe Test
## 95% family-wise confidence level
##
## $smkerst
## diff lwr.ci upr.ci pval
## Moderate-Light 0.46763920 -3.038017 3.973296 0.9479
## Heavy-Light 0.07541816 -3.756292 3.907128 0.9988
## Heavy-Moderate -0.39222104 -2.930988 2.146546 0.9307
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This project demonstrates reproducible data analysis in R, combining data cleaning, descriptive statistics, ANOVA, and visualization. It examines how blood pressure and heart rate vary with smoking intensity among smokers, highlighting the cardiovascular impact of tobacco use, and showcases skills in data wrangling, visualization, and communication.