Using R to Connect to PostgreSQL on Ubuntu
This guide covers connecting R to PostgreSQL databases on Ubuntu systems.
Quick Start (For Existing R Installation)
If you already have R installed and just need to connect to an existing PostgreSQL database:
- Check PostgreSQL client libraries:
libpq-devmay already be installed on your system - Install R packages:
install.packages(c("DBI", "RPostgreSQL")) - Connect: Use
dbConnect()with your database credentials - Query: Remember to use
research.table_namefor the research schema
Check and Install R if Needed
Verify R Installation
# Check if R is installed and its version
R --version
# If R is installed, you should see output like:
# R version 4.x.x (YYYY-MM-DD) -- "Version Name"
Install R (Only if Not Already Installed)
# If R is not installed or you need a newer version:
sudo apt update
sudo apt install r-base
# Verify installation
R --version
Install R PostgreSQL Packages
# Install PostgreSQL connectivity packages
install.packages(c(
"DBI", # Database interface (required)
"RPostgreSQL", # PostgreSQL driver
"keyring" # Secure credential storage (optional)
))
# Alternative modern driver with better performance
install.packages("RPostgres")
Note about Personal Library Installation: If you see warnings like:
Warning: 'lib = "/usr/local/lib/R/site-library"' is not writable
Would you like to use a personal library instead? (yes/No/cancel) yes
Would you like to create a personal library ... to install packages into? (yes/No/cancel) yes
This is normal in shared computing environments. Answer "yes" to both questions. R will create a personal library directory (typically in your home directory like ~/R/x86_64-pc-linux-gnu-library/4.3/) where your packages will be installed. This doesn't affect functionality - R will automatically find and load packages from your personal library.
Check PostgreSQL Client Libraries on Ubuntu
Important Note: PostgreSQL client libraries (libpq-dev) are often already installed on many systems, especially shared computing environments. These provide the necessary drivers for R packages to communicate with PostgreSQL servers.
# Check if PostgreSQL client libraries are already installed
dpkg -l | grep libpq-dev
# If not installed (no output from above command), then install:
sudo apt update
sudo apt install libpq-dev
If you can successfully install the R packages (DBI, RPostgreSQL) without errors, then the client libraries are already available and you don't need to install anything additional.
R PostgreSQL Connection for Research Database
# Install required packages
install.packages(c("DBI", "RPostgreSQL"))
# Load libraries
library(DBI)
library(RPostgreSQL)
# Connect to PostgreSQL research database
con <- dbConnect(RPostgreSQL::PostgreSQL(),
dbname = "your_database_name",
host = "your_server_host",
port = 5432,
user = "your_username",
password = "your_password")
# Test connection
dbGetQuery(con, "SELECT version();")
# IMPORTANT: Research data is in the 'research' schema
# Always use 'research.table_name' unless you've configured a search path
# Example 1: Get ADRC cohort demographics
demographics <- dbGetQuery(con, "
SELECT
person_id,
gender_source_value,
race_source_value,
ethnicity_source_value,
adrc_cohort,
source_system
FROM research.person
WHERE adrc_cohort = 1
LIMIT 10
")
print(demographics)
# Example 2: Get condition names with concepts
conditions <- dbGetQuery(con, "
SELECT
co.person_id,
co.age_at_condition_start,
c.concept_name,
c.vocabulary_id,
co.condition_source_value
FROM research.condition_occurrence co
JOIN research.concept c ON co.condition_concept_id = c.concept_id
WHERE co.age_at_condition_start >= 65
LIMIT 10
")
print(conditions)
# Close connection
dbDisconnect(con)
PostgreSQL Connection Examples
Basic PostgreSQL Connection
library(DBI)
library(RPostgreSQL)
# Connect to PostgreSQL database
con <- dbConnect(RPostgreSQL::PostgreSQL(),
dbname = "your_database_name",
host = "your_server_host",
port = 5432,
user = "your_username",
password = "your_password")
# Test basic connection
dbGetQuery(con, "SELECT current_database(), current_user;")
# CRITICAL: Research data is in 'research' schema
# List available tables in research schema
tables <- dbGetQuery(con, "
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'research'
ORDER BY table_name
")
print(tables)
# Quick research data check
person_count <- dbGetQuery(con, "
SELECT
COUNT(*) as total_people,
SUM(CASE WHEN adrc_cohort = 1 THEN 1 ELSE 0 END) as adrc_cohort_count
FROM research.person
")
print(person_count)
# Close connection
dbDisconnect(con)
Alternative with RPostgres Driver
library(DBI)
library(RPostgres)
# Connect using the modern RPostgres driver (better performance)
con <- dbConnect(RPostgres::Postgres(),
dbname = "your_database_name",
host = "your_server_host",
port = 5432,
user = "your_username",
password = "your_password")
# Query research data with better performance for large datasets
recent_conditions <- dbGetQuery(con, "
SELECT
co.person_id,
co.age_at_condition_start,
c.concept_name,
c.domain_id
FROM research.condition_occurrence co
JOIN research.concept c ON co.condition_concept_id = c.concept_id
WHERE co.age_at_condition_start >= 70
ORDER BY co.age_at_condition_start DESC
LIMIT 100
")
print(head(recent_conditions))
# Close connection
dbDisconnect(con)
Research Database Query Examples
Example 1: Patient Demographics Analysis
# ADRC cohort demographics with condition counts
demographics_summary <- dbGetQuery(con, "
SELECT
p.gender_source_value,
p.race_source_value,
COUNT(DISTINCT p.person_id) as patient_count,
COUNT(DISTINCT co.condition_occurrence_id) as total_conditions,
ROUND(AVG(co.age_at_condition_start), 1) as avg_age_at_first_condition
FROM research.person p
LEFT JOIN research.condition_occurrence co ON p.person_id = co.person_id
WHERE p.adrc_cohort = 1
GROUP BY p.gender_source_value, p.race_source_value
ORDER BY patient_count DESC
")
Example 2: Medication Analysis
# Most common medications in ADRC cohort
common_meds <- dbGetQuery(con, "
SELECT
c.concept_name as medication_name,
c.vocabulary_id,
COUNT(DISTINCT de.person_id) as patient_count,
COUNT(*) as total_exposures,
ROUND(AVG(de.age_at_drug_start), 1) as avg_start_age
FROM research.drug_exposure de
JOIN research.concept c ON de.drug_concept_id = c.concept_id
JOIN research.person p ON de.person_id = p.person_id
WHERE p.adrc_cohort = 1
AND c.domain_id = 'Drug'
AND c.vocabulary_id IN ('RxNorm', 'NDC')
GROUP BY c.concept_name, c.vocabulary_id
HAVING COUNT(DISTINCT de.person_id) >= 10
ORDER BY patient_count DESC
LIMIT 20
")
Example 3: Lab Values Analysis
# Recent lab values with normal ranges
lab_values <- dbGetQuery(con, "
SELECT
c.concept_name as lab_name,
m.value_as_number,
u.concept_name as unit,
m.range_low,
m.range_high,
CASE
WHEN m.value_as_number < m.range_low THEN 'Low'
WHEN m.value_as_number > m.range_high THEN 'High'
ELSE 'Normal'
END as result_flag,
m.age_at_measurement
FROM research.measurement m
JOIN research.concept c ON m.measurement_concept_id = c.concept_id
LEFT JOIN research.concept u ON m.unit_concept_id = u.concept_id
JOIN research.person p ON m.person_id = p.person_id
WHERE p.adrc_cohort = 1
AND m.value_as_number IS NOT NULL
AND m.age_at_measurement >= 65
AND c.concept_name ILIKE '%cholesterol%'
ORDER BY m.age_at_measurement DESC
LIMIT 100
")
Example 4: Temporal Analysis
# Age progression of conditions
condition_timeline <- dbGetQuery(con, "
SELECT
co.person_id,
c.concept_name as condition_name,
co.age_at_condition_start,
CASE
WHEN co.age_at_condition_start < 65 THEN 'Under 65'
WHEN co.age_at_condition_start BETWEEN 65 AND 74 THEN '65-74'
WHEN co.age_at_condition_start BETWEEN 75 AND 84 THEN '75-84'
ELSE '85+'
END as age_group
FROM research.condition_occurrence co
JOIN research.concept c ON co.condition_concept_id = c.concept_id
JOIN research.person p ON co.person_id = p.person_id
WHERE p.adrc_cohort = 1
AND c.domain_id = 'Condition'
AND c.concept_name ILIKE '%dementia%'
ORDER BY co.person_id, co.age_at_condition_start
")
Example 5: Mortality Analysis
# Survival analysis data
survival_data <- dbGetQuery(con, "
SELECT
p.person_id,
p.gender_source_value,
d.age_at_death,
dc.concept_name as cause_of_death,
CASE WHEN d.person_id IS NOT NULL THEN 1 ELSE 0 END as deceased
FROM research.person p
LEFT JOIN research.death d ON p.person_id = d.person_id
LEFT JOIN research.concept dc ON d.cause_concept_id = dc.concept_id
WHERE p.adrc_cohort = 1
ORDER BY p.person_id
")
### Schema Configuration (Optional)
```r
# Set search path to avoid typing 'research.' every time
dbExecute(con, "SET search_path TO research, public;")
# Now you can query without schema prefix
simple_query <- dbGetQuery(con, "
SELECT COUNT(*) as total_patients
FROM person
WHERE adrc_cohort = 1
")
# NOTE: This setting only lasts for your current connection
# You'll need to set it again each time you reconnect
Important Notes for Research Schema
Schema Prefix Requirements
- Always use
research.table_nameunless you've set the search path - Available tables:
person,condition_occurrence,drug_exposure,measurement,observation,procedure_occurrence,visit_occurrence,death,concept - The
concepttable is essential for translating concept IDs to readable names
Key Fields to Remember
- Age fields: All temporal data uses
age_at_*instead of dates - ADRC cohort: Filter with
adrc_cohort = 1for main research population - Concept joins: Always join to
research.conceptto get readable names - Standard concepts: Use
standard_concept = 'S'for preferred terminology
Secure Connection Management
Using Environment Variables
# Set environment variables in ~/.bashrc or ~/.profile
export DB_HOST="your_server_host"
export DB_USER="your_username"
export DB_PASSWORD="your_password"
export DB_NAME="your_database_name"
# Use environment variables for secure connection
con <- dbConnect(RPostgreSQL::PostgreSQL(),
host = Sys.getenv("DB_HOST"),
user = Sys.getenv("DB_USER"),
password = Sys.getenv("DB_PASSWORD"),
dbname = Sys.getenv("DB_NAME"),
port = 5432)
Using Configuration Files
# Create config.R file
db_config <- list(
host = "your_server_host",
port = 5432,
dbname = "your_database_name",
user = "your_username"
)
# Use config file (password prompt for security)
source("config.R")
con <- dbConnect(RPostgreSQL::PostgreSQL(),
host = db_config$host,
port = db_config$port,
dbname = db_config$dbname,
user = db_config$user,
password = rstudioapi::askForPassword("Database password"))
Common Research Database Operations
Reading Research Data
# Read specific research tables
person_data <- dbReadTable(con, "research.person")
# Execute research-specific queries
condition_summary <- dbGetQuery(con, "
SELECT
c.concept_name,
COUNT(DISTINCT co.person_id) as patient_count,
ROUND(AVG(co.age_at_condition_start), 1) as avg_age
FROM research.condition_occurrence co
JOIN research.concept c ON co.condition_concept_id = c.concept_id
JOIN research.person p ON co.person_id = p.person_id
WHERE p.adrc_cohort = 1
GROUP BY c.concept_name
ORDER BY patient_count DESC
")
# Read in chunks for large research datasets
res <- dbSendQuery(con, "SELECT * FROM research.measurement WHERE age_at_measurement >= 65")
while (!dbHasCompleted(res)) {
chunk <- dbFetch(res, n = 5000) # Process 5000 rows at a time
# Analyze chunk
print(paste("Processed", nrow(chunk), "measurements"))
}
dbClearResult(res)
Research Data Analysis Patterns
# Note: Research schema is read-only for analysis
# No INSERT/UPDATE/DELETE operations allowed
# Common pattern: Join with concept table for readable names
readable_data <- dbGetQuery(con, "
SELECT
de.person_id,
de.age_at_drug_start,
c.concept_name as drug_name,
de.days_supply
FROM research.drug_exposure de
JOIN research.concept c ON de.drug_concept_id = c.concept_id
WHERE de.age_at_drug_start >= 60
")
Performance Tips for Research Data
Chunked Reading for Large Tables
# For large measurement or condition tables
process_large_table <- function(table_name, chunk_size = 10000) {
query <- paste("SELECT * FROM research.", table_name, " WHERE adrc_cohort = 1", sep="")
res <- dbSendQuery(con, query)
while (!dbHasCompleted(res)) {
chunk <- dbFetch(res, n = chunk_size)
# Process your chunk here
cat("Processed", nrow(chunk), "rows\n")
}
dbClearResult(res)
}
Optimized Research Queries
# Use indexes efficiently - filter by person_id first
efficient_query <- dbGetQuery(con, "
SELECT co.*, c.concept_name
FROM research.condition_occurrence co
JOIN research.concept c ON co.condition_concept_id = c.concept_id
WHERE co.person_id IN (
SELECT person_id FROM research.person WHERE adrc_cohort = 1
)
AND co.age_at_condition_start >= 65
")
# Measure query performance
system.time({
result <- dbGetQuery(con, "SELECT COUNT(*) FROM research.condition_occurrence")
})
Troubleshooting Common Issues
Package Installation Problems
# Check if PostgreSQL client libraries are installed
dpkg -l | grep libpq-dev
# Only install if missing (no output from above command)
sudo apt install libpq-dev build-essential
# If RPostgreSQL fails to install
install.packages("RPostgreSQL", type = "source")
Note: The libpq-dev package contains PostgreSQL client libraries that R packages need to compile and connect to PostgreSQL databases. Many systems already have these installed. If your R package installation succeeds without errors, you likely don't need to install additional system dependencies.
Connection Problems
# Test connection with error handling
tryCatch({
con <- dbConnect(RPostgreSQL::PostgreSQL(),
dbname = "your_database",
host = "your_host",
port = 5432,
user = "your_user",
password = "your_password")
# Test basic query
result <- dbGetQuery(con, "SELECT current_user, current_database()")
print("Connection successful!")
print(result)
dbDisconnect(con)
}, error = function(e) {
print(paste("Connection failed:", e$message))
})
Research Schema Access Issues
# Verify research schema exists and you have access
schema_check <- dbGetQuery(con, "
SELECT
schemaname,
tablename
FROM pg_tables
WHERE schemaname = 'research'
ORDER BY tablename
")
if(nrow(schema_check) == 0) {
print("Research schema not found or no access")
} else {
print("Available research tables:")
print(schema_check$tablename)
}
Best Practices for Research Data
- Always close connections: Use
dbDisconnect()when finished - Filter by ADRC cohort: Use
WHERE adrc_cohort = 1for main research population - Use schema prefix: Always specify
research.table_name - Join with concepts: Always join to
research.conceptfor readable names - Handle large datasets: Use chunked reading for measurements and observations
- Monitor performance: Use
system.time()for query optimization
# Example research query with best practices
research_analysis <- function(con) {
tryCatch({
# Filter ADRC cohort and join with concepts
result <- dbGetQuery(con, "
SELECT
p.person_id,
p.gender_source_value,
co.age_at_condition_start,
c.concept_name as condition_name
FROM research.person p
JOIN research.condition_occurrence co ON p.person_id = co.person_id
JOIN research.concept c ON co.condition_concept_id = c.concept_id
WHERE p.adrc_cohort = 1
AND co.age_at_condition_start >= 65
ORDER BY co.age_at_condition_start
")
return(result)
}, error = function(e) {
message("Query failed: ", e$message)
return(NULL)
}, finally = {
dbDisconnect(con)
})
}
Additional Resources
R PostgreSQL Packages
- DBI Package Documentation - Database interface standards
- RPostgreSQL Documentation - PostgreSQL driver
- RPostgres Documentation - Modern PostgreSQL driver
OMOP and Research Data
- OMOP Common Data Model - Official OMOP CDM documentation
- Athena Vocabulary Browser - Search OMOP concepts and vocabularies
- OHDSI Tools - Additional analysis tools for OMOP data