Using R to Connect to PostgreSQL on Ubuntu

This guide covers connecting R to PostgreSQL databases on Ubuntu systems.

Quick Start (For Existing R Installation)

If you already have R installed and just need to connect to an existing PostgreSQL database:

Check PostgreSQL client libraries: libpq-dev may already be installed on your system
Install R packages: install.packages(c("DBI", "RPostgreSQL"))
Connect: Use dbConnect() with your database credentials
Query: Remember to use research.table_name for the research schema

Check and Install R if Needed

Verify R Installation

# Check if R is installed and its version
R --version

# If R is installed, you should see output like:
# R version 4.x.x (YYYY-MM-DD) -- "Version Name"

Install R (Only if Not Already Installed)

# If R is not installed or you need a newer version:
sudo apt update
sudo apt install r-base 

# Verify installation
R --version

Install R PostgreSQL Packages

# Install PostgreSQL connectivity packages
install.packages(c(
  "DBI",         # Database interface (required)
  "RPostgreSQL", # PostgreSQL driver
  "keyring"      # Secure credential storage (optional)
))

# Alternative modern driver with better performance
install.packages("RPostgres")

Note about Personal Library Installation: If you see warnings like:

Warning: 'lib = "/usr/local/lib/R/site-library"' is not writable
Would you like to use a personal library instead? (yes/No/cancel) yes
Would you like to create a personal library ... to install packages into? (yes/No/cancel) yes

This is normal in shared computing environments. Answer "yes" to both questions. R will create a personal library directory (typically in your home directory like ~/R/x86_64-pc-linux-gnu-library/4.3/) where your packages will be installed. This doesn't affect functionality - R will automatically find and load packages from your personal library.

Check PostgreSQL Client Libraries on Ubuntu

Important Note: PostgreSQL client libraries (libpq-dev) are often already installed on many systems, especially shared computing environments. These provide the necessary drivers for R packages to communicate with PostgreSQL servers.

# Check if PostgreSQL client libraries are already installed
dpkg -l | grep libpq-dev

# If not installed (no output from above command), then install:
sudo apt update
sudo apt install libpq-dev

If you can successfully install the R packages (DBI, RPostgreSQL) without errors, then the client libraries are already available and you don't need to install anything additional.

R PostgreSQL Connection for Research Database

# Install required packages
install.packages(c("DBI", "RPostgreSQL"))

# Load libraries
library(DBI)
library(RPostgreSQL)

# Connect to PostgreSQL research database
con <- dbConnect(RPostgreSQL::PostgreSQL(),
                 dbname = "your_database_name",
                 host = "your_server_host",
                 port = 5432,
                 user = "your_username",
                 password = "your_password")

# Test connection
dbGetQuery(con, "SELECT version();")

# IMPORTANT: Research data is in the 'research' schema
# Always use 'research.table_name' unless you've configured a search path

# Example 1: Get ADRC cohort demographics
demographics <- dbGetQuery(con, "
  SELECT 
    person_id,
    gender_source_value,
    race_source_value,
    ethnicity_source_value,
    adrc_cohort,
    source_system
  FROM research.person 
  WHERE adrc_cohort = 1
  LIMIT 10
")
print(demographics)

# Example 2: Get condition names with concepts
conditions <- dbGetQuery(con, "
  SELECT 
    co.person_id,
    co.age_at_condition_start,
    c.concept_name,
    c.vocabulary_id,
    co.condition_source_value
  FROM research.condition_occurrence co
  JOIN research.concept c ON co.condition_concept_id = c.concept_id
  WHERE co.age_at_condition_start >= 65
  LIMIT 10
")
print(conditions)

# Close connection
dbDisconnect(con)

PostgreSQL Connection Examples

Basic PostgreSQL Connection

library(DBI)
library(RPostgreSQL)

# Connect to PostgreSQL database
con <- dbConnect(RPostgreSQL::PostgreSQL(),
                 dbname = "your_database_name",
                 host = "your_server_host",
                 port = 5432,
                 user = "your_username",
                 password = "your_password")

# Test basic connection
dbGetQuery(con, "SELECT current_database(), current_user;")

# CRITICAL: Research data is in 'research' schema
# List available tables in research schema
tables <- dbGetQuery(con, "
  SELECT table_name 
  FROM information_schema.tables 
  WHERE table_schema = 'research'
  ORDER BY table_name
")
print(tables)

# Quick research data check
person_count <- dbGetQuery(con, "
  SELECT 
    COUNT(*) as total_people,
    SUM(CASE WHEN adrc_cohort = 1 THEN 1 ELSE 0 END) as adrc_cohort_count
  FROM research.person
")
print(person_count)

# Close connection
dbDisconnect(con)

Alternative with RPostgres Driver

library(DBI)
library(RPostgres)

# Connect using the modern RPostgres driver (better performance)
con <- dbConnect(RPostgres::Postgres(),
                 dbname = "your_database_name",
                 host = "your_server_host",
                 port = 5432,
                 user = "your_username",
                 password = "your_password")

# Query research data with better performance for large datasets
recent_conditions <- dbGetQuery(con, "
  SELECT 
    co.person_id,
    co.age_at_condition_start,
    c.concept_name,
    c.domain_id
  FROM research.condition_occurrence co
  JOIN research.concept c ON co.condition_concept_id = c.concept_id
  WHERE co.age_at_condition_start >= 70
  ORDER BY co.age_at_condition_start DESC
  LIMIT 100
")
print(head(recent_conditions))

# Close connection
dbDisconnect(con)

Research Database Query Examples

Example 1: Patient Demographics Analysis

# ADRC cohort demographics with condition counts
demographics_summary <- dbGetQuery(con, "
  SELECT 
    p.gender_source_value,
    p.race_source_value,
    COUNT(DISTINCT p.person_id) as patient_count,
    COUNT(DISTINCT co.condition_occurrence_id) as total_conditions,
    ROUND(AVG(co.age_at_condition_start), 1) as avg_age_at_first_condition
  FROM research.person p
  LEFT JOIN research.condition_occurrence co ON p.person_id = co.person_id
  WHERE p.adrc_cohort = 1
  GROUP BY p.gender_source_value, p.race_source_value
  ORDER BY patient_count DESC
")

Example 2: Medication Analysis

# Most common medications in ADRC cohort
common_meds <- dbGetQuery(con, "
  SELECT 
    c.concept_name as medication_name,
    c.vocabulary_id,
    COUNT(DISTINCT de.person_id) as patient_count,
    COUNT(*) as total_exposures,
    ROUND(AVG(de.age_at_drug_start), 1) as avg_start_age
  FROM research.drug_exposure de
  JOIN research.concept c ON de.drug_concept_id = c.concept_id
  JOIN research.person p ON de.person_id = p.person_id
  WHERE p.adrc_cohort = 1
    AND c.domain_id = 'Drug'
    AND c.vocabulary_id IN ('RxNorm', 'NDC')
  GROUP BY c.concept_name, c.vocabulary_id
  HAVING COUNT(DISTINCT de.person_id) >= 10
  ORDER BY patient_count DESC
  LIMIT 20
")

Example 3: Lab Values Analysis

# Recent lab values with normal ranges
lab_values <- dbGetQuery(con, "
  SELECT 
    c.concept_name as lab_name,
    m.value_as_number,
    u.concept_name as unit,
    m.range_low,
    m.range_high,
    CASE 
      WHEN m.value_as_number < m.range_low THEN 'Low'
      WHEN m.value_as_number > m.range_high THEN 'High'
      ELSE 'Normal'
    END as result_flag,
    m.age_at_measurement
  FROM research.measurement m
  JOIN research.concept c ON m.measurement_concept_id = c.concept_id
  LEFT JOIN research.concept u ON m.unit_concept_id = u.concept_id
  JOIN research.person p ON m.person_id = p.person_id
  WHERE p.adrc_cohort = 1
    AND m.value_as_number IS NOT NULL
    AND m.age_at_measurement >= 65
    AND c.concept_name ILIKE '%cholesterol%'
  ORDER BY m.age_at_measurement DESC
  LIMIT 100
")

Example 4: Temporal Analysis

# Age progression of conditions
condition_timeline <- dbGetQuery(con, "
  SELECT 
    co.person_id,
    c.concept_name as condition_name,
    co.age_at_condition_start,
    CASE 
      WHEN co.age_at_condition_start < 65 THEN 'Under 65'
      WHEN co.age_at_condition_start BETWEEN 65 AND 74 THEN '65-74'
      WHEN co.age_at_condition_start BETWEEN 75 AND 84 THEN '75-84'
      ELSE '85+'
    END as age_group
  FROM research.condition_occurrence co
  JOIN research.concept c ON co.condition_concept_id = c.concept_id
  JOIN research.person p ON co.person_id = p.person_id
  WHERE p.adrc_cohort = 1
    AND c.domain_id = 'Condition'
    AND c.concept_name ILIKE '%dementia%'
  ORDER BY co.person_id, co.age_at_condition_start
")

Example 5: Mortality Analysis

# Survival analysis data
survival_data <- dbGetQuery(con, "
  SELECT 
    p.person_id,
    p.gender_source_value,
    d.age_at_death,
    dc.concept_name as cause_of_death,
    CASE WHEN d.person_id IS NOT NULL THEN 1 ELSE 0 END as deceased
  FROM research.person p
  LEFT JOIN research.death d ON p.person_id = d.person_id
  LEFT JOIN research.concept dc ON d.cause_concept_id = dc.concept_id
  WHERE p.adrc_cohort = 1
  ORDER BY p.person_id
")

### Schema Configuration (Optional)
```r
# Set search path to avoid typing 'research.' every time
dbExecute(con, "SET search_path TO research, public;")

# Now you can query without schema prefix
simple_query <- dbGetQuery(con, "
  SELECT COUNT(*) as total_patients 
  FROM person 
  WHERE adrc_cohort = 1
")

# NOTE: This setting only lasts for your current connection
# You'll need to set it again each time you reconnect

Important Notes for Research Schema

Schema Prefix Requirements

Always use research.table_name unless you've set the search path
Available tables: person, condition_occurrence, drug_exposure, measurement, observation, procedure_occurrence, visit_occurrence, death, concept
The concept table is essential for translating concept IDs to readable names

Key Fields to Remember

Age fields: All temporal data uses age_at_* instead of dates
ADRC cohort: Filter with adrc_cohort = 1 for main research population
Concept joins: Always join to research.concept to get readable names
Standard concepts: Use standard_concept = 'S' for preferred terminology

Secure Connection Management

Using Environment Variables

# Set environment variables in ~/.bashrc or ~/.profile
export DB_HOST="your_server_host"
export DB_USER="your_username"
export DB_PASSWORD="your_password"
export DB_NAME="your_database_name"

# Use environment variables for secure connection
con <- dbConnect(RPostgreSQL::PostgreSQL(),
                 host = Sys.getenv("DB_HOST"),
                 user = Sys.getenv("DB_USER"),
                 password = Sys.getenv("DB_PASSWORD"),
                 dbname = Sys.getenv("DB_NAME"),
                 port = 5432)

Using Configuration Files

# Create config.R file
db_config <- list(
  host = "your_server_host",
  port = 5432,
  dbname = "your_database_name",
  user = "your_username"
)

# Use config file (password prompt for security)
source("config.R")
con <- dbConnect(RPostgreSQL::PostgreSQL(),
                 host = db_config$host,
                 port = db_config$port,
                 dbname = db_config$dbname,
                 user = db_config$user,
                 password = rstudioapi::askForPassword("Database password"))

Common Research Database Operations

Reading Research Data

# Read specific research tables
person_data <- dbReadTable(con, "research.person")

# Execute research-specific queries
condition_summary <- dbGetQuery(con, "
  SELECT 
    c.concept_name,
    COUNT(DISTINCT co.person_id) as patient_count,
    ROUND(AVG(co.age_at_condition_start), 1) as avg_age
  FROM research.condition_occurrence co
  JOIN research.concept c ON co.condition_concept_id = c.concept_id
  JOIN research.person p ON co.person_id = p.person_id
  WHERE p.adrc_cohort = 1
  GROUP BY c.concept_name
  ORDER BY patient_count DESC
")

# Read in chunks for large research datasets
res <- dbSendQuery(con, "SELECT * FROM research.measurement WHERE age_at_measurement >= 65")
while (!dbHasCompleted(res)) {
  chunk <- dbFetch(res, n = 5000)  # Process 5000 rows at a time
  # Analyze chunk
  print(paste("Processed", nrow(chunk), "measurements"))
}
dbClearResult(res)

Research Data Analysis Patterns

# Note: Research schema is read-only for analysis
# No INSERT/UPDATE/DELETE operations allowed

# Common pattern: Join with concept table for readable names
readable_data <- dbGetQuery(con, "
  SELECT 
    de.person_id,
    de.age_at_drug_start,
    c.concept_name as drug_name,
    de.days_supply
  FROM research.drug_exposure de
  JOIN research.concept c ON de.drug_concept_id = c.concept_id
  WHERE de.age_at_drug_start >= 60
")

Performance Tips for Research Data

Chunked Reading for Large Tables

# For large measurement or condition tables
process_large_table <- function(table_name, chunk_size = 10000) {
  query <- paste("SELECT * FROM research.", table_name, " WHERE adrc_cohort = 1", sep="")
  res <- dbSendQuery(con, query)

  while (!dbHasCompleted(res)) {
    chunk <- dbFetch(res, n = chunk_size)
    # Process your chunk here
    cat("Processed", nrow(chunk), "rows\n")
  }
  dbClearResult(res)
}

Optimized Research Queries

# Use indexes efficiently - filter by person_id first
efficient_query <- dbGetQuery(con, "
  SELECT co.*, c.concept_name
  FROM research.condition_occurrence co
  JOIN research.concept c ON co.condition_concept_id = c.concept_id
  WHERE co.person_id IN (
    SELECT person_id FROM research.person WHERE adrc_cohort = 1
  )
  AND co.age_at_condition_start >= 65
")

# Measure query performance
system.time({
  result <- dbGetQuery(con, "SELECT COUNT(*) FROM research.condition_occurrence")
})

Troubleshooting Common Issues

Package Installation Problems

# Check if PostgreSQL client libraries are installed
dpkg -l | grep libpq-dev

# Only install if missing (no output from above command)
sudo apt install libpq-dev build-essential

# If RPostgreSQL fails to install
install.packages("RPostgreSQL", type = "source")

Note: The libpq-dev package contains PostgreSQL client libraries that R packages need to compile and connect to PostgreSQL databases. Many systems already have these installed. If your R package installation succeeds without errors, you likely don't need to install additional system dependencies.

Connection Problems

# Test connection with error handling
tryCatch({
  con <- dbConnect(RPostgreSQL::PostgreSQL(),
                   dbname = "your_database",
                   host = "your_host",
                   port = 5432,
                   user = "your_user",
                   password = "your_password")

  # Test basic query
  result <- dbGetQuery(con, "SELECT current_user, current_database()")
  print("Connection successful!")
  print(result)

  dbDisconnect(con)
}, error = function(e) {
  print(paste("Connection failed:", e$message))
})

Research Schema Access Issues

# Verify research schema exists and you have access
schema_check <- dbGetQuery(con, "
  SELECT 
    schemaname,
    tablename
  FROM pg_tables 
  WHERE schemaname = 'research'
  ORDER BY tablename
")

if(nrow(schema_check) == 0) {
  print("Research schema not found or no access")
} else {
  print("Available research tables:")
  print(schema_check$tablename)
}

Best Practices for Research Data

Always close connections: Use dbDisconnect() when finished
Filter by ADRC cohort: Use WHERE adrc_cohort = 1 for main research population
Use schema prefix: Always specify research.table_name
Join with concepts: Always join to research.concept for readable names
Handle large datasets: Use chunked reading for measurements and observations
Monitor performance: Use system.time() for query optimization

# Example research query with best practices
research_analysis <- function(con) {
  tryCatch({
    # Filter ADRC cohort and join with concepts
    result <- dbGetQuery(con, "
      SELECT 
        p.person_id,
        p.gender_source_value,
        co.age_at_condition_start,
        c.concept_name as condition_name
      FROM research.person p
      JOIN research.condition_occurrence co ON p.person_id = co.person_id
      JOIN research.concept c ON co.condition_concept_id = c.concept_id
      WHERE p.adrc_cohort = 1
        AND co.age_at_condition_start >= 65
      ORDER BY co.age_at_condition_start
    ")

    return(result)

  }, error = function(e) {
    message("Query failed: ", e$message)
    return(NULL)
  }, finally = {
    dbDisconnect(con)
  })
}