Introduction

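This analysis explores the relationship between tobacco use (smoking intensity) and cardiovascular health indicators (blood pressure and heart rate) among smokers. The dataset includes measures of systolic blood pressure (BP), diastolic BP, and heart rate (HR), which are compared across smoking-intensity groups (light: 1–5, moderate: 6–20, heavy: 21 or more cigarettes per day) and sex.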

Data Import & Cleaning

# Load the raw dataset from a path relative to the working directory
heart <- read.csv("../data/dataset.csv")

# Keep only the identifier, demographic, and cardiovascular variables used below
heart <- heart %>%
  select(
    randid, sex, age, sysbp, diabp, cigpday, heartrte,
    prevhyp, hyperten, timehyp
  )

# Restrict to smokers: at least one cigarette per day.
# filter() also drops rows where cigpday is NA.
smokers <- heart %>%
  filter(cigpday >= 1)

# Derive smoking-intensity groups and label the categorical variables
smokers <- smokers %>%
  mutate(
    # case_when() uses the first condition that matches, so each group only
    # needs its upper bound. Writing the bounds contiguously (<= 5, <= 20, > 20)
    # avoids the gaps that "<= 5 / >= 6" and "<= 20 / >= 21" leave for
    # non-integer values, which would otherwise silently become NA.
    smkerst = case_when(
      cigpday <= 5 ~ "Light",
      cigpday <= 20 ~ "Moderate",
      cigpday > 20 ~ "Heavy"
    ),
    # factor() assigns labels to the sorted distinct codes in each column.
    # NOTE(review): assumes the lower sex code is male and that prevhyp /
    # hyperten are 0/1 flags -- confirm against the dataset codebook.
    sex = factor(sex, labels = c("Male", "Female")),
    prevhyp = factor(prevhyp, labels = c("Free", "Prev")),
    hyperten = factor(hyperten, labels = c("Free", "Prev")),
    # Order the intensity levels so tables and plots run Light -> Heavy
    smkerst = factor(smkerst, levels = c("Light", "Moderate", "Heavy"))
  )

glimpse(smokers)
## Rows: 780
## Columns: 11
## $ randid   <int> 12806, 24721, 33077, 34689, 40435, 45464, 47561, 68194, 68397…
## $ sex      <fct> Female, Female, Male, Female, Female, Female, Male, Male, Mal…
## $ age      <int> 57, 51, 60, 49, 54, 64, 56, 55, 53, 54, 54, 53, 58, 49, 59, 5…
## $ sysbp    <dbl> 110.0, 141.0, 144.5, 163.0, 126.0, 150.0, 132.0, 129.0, 148.0…
## $ diabp    <dbl> 46.0, 81.0, 80.0, 96.0, 75.0, 89.0, 70.0, 85.0, 89.0, 111.0, …
## $ cigpday  <int> 30, 15, 10, 10, 40, 20, 40, 50, 8, 3, 16, 3, 30, 35, 20, 20, …
## $ heartrte <int> 75, 85, 57, 82, 85, 90, 100, 84, 80, 63, 70, 90, 90, 70, 82, …
## $ prevhyp  <fct> Free, Prev, Prev, Prev, Free, Prev, Prev, Free, Prev, Prev, F…
## $ hyperten <fct> Prev, Prev, Prev, Prev, Prev, Prev, Prev, Prev, Prev, Prev, F…
## $ timehyp  <int> 8679, 4408, 0, 2157, 5933, 2177, 0, 6199, 785, 0, 8766, 8766,…
## $ smkerst  <fct> Heavy, Moderate, Moderate, Moderate, Heavy, Moderate, Heavy, …

Descriptive Statistics

# Descriptive statistics for systolic BP, diastolic BP, and heart rate within
# each smoking-intensity group: group size, mean, standard deviation (SD),
# and standard error of the mean (SE).
smokers_descrip <- smokers %>%
  group_by(smkerst) %>%
  summarise(
    N = n(),
    Mean_SysBP = mean(sysbp, na.rm = TRUE),
    Mean_DiaBP = mean(diabp, na.rm = TRUE),
    Mean_Heartrate = mean(heartrte, na.rm = TRUE),
    SD_SysBP = sd(sysbp, na.rm = TRUE),
    SD_DiaBP = sd(diabp, na.rm = TRUE),
    SD_Heartrate = sd(heartrte, na.rm = TRUE),
    # The means and SDs drop missing values, so each SE divides by the number
    # of non-missing observations of that variable rather than by N, which
    # also counts rows where the variable is NA.
    SE_SysBP = SD_SysBP / sqrt(sum(!is.na(sysbp))),
    SE_DiaBP = SD_DiaBP / sqrt(sum(!is.na(diabp))),
    SE_Heartrate = SD_Heartrate / sqrt(sum(!is.na(heartrte)))
  )

smokers_descrip
## # A tibble: 3 × 11
##   smkerst      N Mean_SysBP Mean_DiaBP Mean_Heartrate SD_SysBP SD_DiaBP
##   <fct>    <int>      <dbl>      <dbl>          <dbl>    <dbl>    <dbl>
## 1 Light       93       135.       79.7           78.3     24.3     11.8
## 2 Moderate   471       133.       79.8           78.8     20.7     11.2
## 3 Heavy      216       138.       81.8           78.4     21.4     10.8
## # ℹ 4 more variables: SD_Heartrate <dbl>, SE_SysBP <dbl>, SE_DiaBP <dbl>,
## #   SE_Heartrate <dbl>

Blood Pressure by Smoking Intensity

# Mean and standard error of systolic and diastolic BP per smoking-intensity
# group, reshaped to one row per (group, BP type) for plotting.
bp_long <- smokers %>%
  group_by(smkerst) %>%
  summarise(
    mean_sys = mean(sysbp, na.rm = TRUE),
    mean_dia = mean(diabp, na.rm = TRUE),
    # Divide by the count of non-missing values so the SE stays consistent
    # with the na.rm = TRUE mean and SD; n() would also count NA rows.
    se_sys = sd(sysbp, na.rm = TRUE) / sqrt(sum(!is.na(sysbp))),
    se_dia = sd(diabp, na.rm = TRUE) / sqrt(sum(!is.na(diabp)))
  ) %>%
  pivot_longer(
    cols = starts_with("mean"),
    names_to = "Type",
    values_to = "BP"
  ) %>%
  mutate(
    # Only the mean columns are pivoted, so pick the SE matching each row's
    # type before the type is relabelled for display.
    SE = ifelse(Type == "mean_sys", se_sys, se_dia),
    Type = recode(Type, mean_sys = "Systolic", mean_dia = "Diastolic")
  )

# Dodged bars of mean BP with +/- 1 SE error bars
ggplot(bp_long, aes(x = smkerst, y = BP, fill = Type)) +
  geom_col(position = "dodge") +
  geom_errorbar(
    aes(ymin = BP - SE, ymax = BP + SE),
    position = position_dodge(0.9),
    width = 0.25
  ) +
  labs(
    title = "Blood Pressure by Smoking Intensity",
    x = "Smoking Status",
    y = "Mean BP (mmHg)"
  ) +
  theme_minimal()
Mean systolic and diastolic BP by smoking intensity

Mean systolic and diastolic BP by smoking intensity

# Write the last plot to disk; width and height use ggsave()'s default units
ggsave(
  filename = "../docs/figures/bp_by_status.png",
  plot = last_plot(),
  width = 6,
  height = 4
)

Heart Rate by Smoking Intensity and Sex

# Mean heart rate and its standard error for each smoking-intensity x sex cell
hr_summary <- smokers %>%
  group_by(smkerst, sex) %>%
  summarise(
    mean_hr = mean(heartrte, na.rm = TRUE),
    # Divide by the count of non-missing heart rates so the SE stays consistent
    # with the na.rm = TRUE mean and SD; n() would also count NA rows.
    se_hr = sd(heartrte, na.rm = TRUE) / sqrt(sum(!is.na(heartrte))),
    # Return an ungrouped tibble; otherwise the result stays grouped by
    # smkerst and summarise() emits a regrouping message.
    .groups = "drop"
  )

# Dodged bars of mean heart rate by sex with +/- 1 SE error bars
ggplot(hr_summary, aes(x = smkerst, y = mean_hr, fill = sex)) +
  geom_col(position = "dodge") +
  geom_errorbar(
    aes(ymin = mean_hr - se_hr, ymax = mean_hr + se_hr),
    position = position_dodge(0.9),
    width = 0.25
  ) +
  labs(
    title = "Heart Rate by Smoking Intensity and Sex",
    x = "Smoking Status",
    y = "Mean HR (bpm)"
  ) +
  theme_minimal()
Average heart rate by smoking intensity and sex

Average heart rate by smoking intensity and sex

# Write the last plot to disk; width and height use ggsave()'s default units
ggsave(
  filename = "../docs/figures/hr_by_sex.png",
  plot = last_plot(),
  width = 6,
  height = 4
)

Statistical Testing

# One-way ANOVA of each outcome on smoking-intensity group (smkerst)
anova_sys <- aov(sysbp ~ smkerst, data = smokers)     # systolic BP
anova_dia <- aov(diabp ~ smkerst, data = smokers)     # diastolic BP
anova_hr <- aov(heartrte ~ smkerst, data = smokers)   # heart rate

# ANOVA table for systolic BP
summary(anova_sys)
##              Df Sum Sq Mean Sq F value Pr(>F)  
## smkerst       2   2808  1404.0   3.079 0.0465 *
## Residuals   777 354256   455.9                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA table for diastolic BP across smoking-intensity groups
summary(anova_dia)
##              Df Sum Sq Mean Sq F value Pr(>F)  
## smkerst       2    616   307.9   2.468 0.0854 .
## Residuals   777  96942   124.8                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA table for heart rate across smoking-intensity groups
summary(anova_hr)
##              Df Sum Sq Mean Sq F value Pr(>F)
## smkerst       2     32   16.25   0.102  0.903
## Residuals   777 123304  158.69
# Post-hoc Scheffe tests: pairwise comparisons between smoking-intensity groups
# Systolic BP
ScheffeTest(anova_sys)
## 
##   Posthoc multiple comparisons of means: Scheffe Test 
##     95% family-wise confidence level
## 
## $smkerst
##                     diff      lwr.ci   upr.ci   pval    
## Moderate-Light -1.850010 -7.79210892 4.092088 0.7472    
## Heavy-Light     2.489471 -4.00528891 8.984232 0.6430    
## Heavy-Moderate  4.339482  0.03626435 8.642699 0.0475 *  
## 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post-hoc Scheffe test for diastolic BP (pairwise group comparisons)
ScheffeTest(anova_dia)
## 
##   Posthoc multiple comparisons of means: Scheffe Test 
##     95% family-wise confidence level
## 
## $smkerst
##                     diff     lwr.ci   upr.ci   pval    
## Moderate-Light 0.1259845 -2.9824259 3.234395 0.9951    
## Heavy-Light    2.0888590 -1.3086579 5.486376 0.3214    
## Heavy-Moderate 1.9628745 -0.2882099 4.213959 0.1023    
## 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post-hoc Scheffe test for heart rate (pairwise group comparisons)
ScheffeTest(anova_hr)
## 
##   Posthoc multiple comparisons of means: Scheffe Test 
##     95% family-wise confidence level
## 
## $smkerst
##                       diff    lwr.ci   upr.ci   pval    
## Moderate-Light  0.46763920 -3.038017 3.973296 0.9479    
## Heavy-Light     0.07541816 -3.756292 3.907128 0.9988    
## Heavy-Moderate -0.39222104 -2.930988 2.146546 0.9307    
## 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Discussion

Conclusion

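This project demonstrates reproducible data analysis in R, combining data cleaning, descriptive statistics, ANOVA, and visualization. It examines the cardiovascular impact of tobacco use among smokers: systolic BP differed modestly across smoking-intensity groups (ANOVA p = 0.0465), driven by heavy versus moderate smokers (Scheffé p = 0.0475), while diastolic BP (p = 0.0854) and heart rate (p = 0.903) showed no significant differences. The project also showcases skills in data wrangling, visualization, and communication.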