Skip to content

PRS processing

Requirements

  • Ubuntu 22.04 (8 CPUs, 32 GB)
  • bcftools (version==1.13)
  • plink v1.90
  • PRSice-2 v2.3.3
  • R v4.2.2
    • data.table (version==1.17.8)
    • ggplot2 (version==3.5.2)
    • scales (version==1.4.0)

Input data

  • restructured lpWGS VCFs
  • restructured SNP-array VCFs
  • True VCFs
  • Base sumstats

The PRS processing scripts were developed with reference to the tutorial provided by Choi et al.1 , which served as a foundational guide.

PRS processing workflow

Imputed VCF files were first concatenated using bcftools concat and annotated with reference information using bcftools annotate. The annotated VCFs were then converted to PLINK binary format (BED) and processed for quality control (QC) and duplicate removal using PLINK. PRSice was then used to calculate polygenic risk scores, using GWAS summary statistics and linkage disequilibrium (LD) reference panels as inputs. The final output consisted of individual-level PRS scores.

Summary statistics

The base data (GWAS summary statistics) for the selected phenotypes must undergo a rigorous quality control (QC) process to ensure its reliability.

GWAS Summary Statistics Sources

The specific GWAS summary statistics used for each phenotype were obtained from the GWAS Catalog:

Phenotype GWAS Catalog ID Source
Height GCST006901 Yengo et al. (2018)
BMI GCST006900 Yengo et al. (2018)
Type 2 Diabetes GCST006867 Xue et al. (2018)
Metabolic Disorder GCST90444487 Park et al. (2024)

Correct sample name

Ensure that sample names do not contain underscores, as these may be introduced during the merging of imputed VCF files. In such cases, the filename used during merging may be incorporated into the sample name to maintain uniqueness across datasets.

Code

set -ue

VCF_FILES=$1   # input bgzipped VCF, e.g. sample.vcf.gz

# Extract the current sample names from the VCF header.
bcftools query -l "$VCF_FILES" > sample_name.txt

# For every name containing an underscore, emit "old_name<TAB>new_name",
# where the new name is everything before the first underscore. Names
# without an underscore pass through unchanged (single column).
sed -E 's/^(([^_]+)_.*)/\1\t\2/g' sample_name.txt > new_name.txt

# If the first line has a single column, no renaming is required.
col_num=$(awk -F'\t' '{print NF}' new_name.txt | head -n 1)
if (( col_num == 1 )); then checker=1; else checker=0; fi

file_name=$(basename "$VCF_FILES" .vcf.gz)

if (( checker )); then
    # Names are already correct: expose the input under the expected
    # output name so the index step below always finds its target.
    # BUG FIX: the symlink was previously named ${file_name}.vcf.gz,
    # which made the subsequent `bcftools index` fail on a missing file.
    echo "Create symlink"
    ln -s "$VCF_FILES" "${file_name}_correct_name.vcf.gz"
else
    # Rewrite the header, replacing old names with underscore-free ones.
    echo "Replace by new name"
    bcftools reheader -s new_name.txt -o "${file_name}_correct_name.vcf.gz" "$VCF_FILES"
fi

bcftools index "${file_name}_correct_name.vcf.gz"

Concatenate VCF files

Concatenate autosome VCF files that share the same prefix (array name / low-pass coverage).

Code

set -ue

# Usage: <prefix> <vcf_folder>
#   prefix      e.g. "AMR-Axiom_JAPONICA"
#   vcf_folder  e.g. "/path/to/vcf_files"
PREFIX=$1
VCF_FOLDER=$2

# Gather every per-chromosome VCF sharing the given prefix.
vcf_list=$(ls ${VCF_FOLDER}/${PREFIX}_chr*.vcf.gz)

# Concatenate them into one bgzipped VCF, then index it.
bcftools concat -Oz -o ${PREFIX}_concat.vcf.gz ${vcf_list}
bcftools index ${PREFIX}_concat.vcf.gz

Annotate VCF files

Code

set -eu

# Usage: <vcf> <reference_vcf>
VCF_FILES=$1  # e.g. "AMR-Axiom_UKB_WCSG_concat.vcf.gz"
REF_VCF=$2    # e.g. "00-All.vcf.gz"

out_base=$(basename $VCF_FILES .vcf.gz)

## Normalise chromosome names to numeric form (strip every "chr")
## so they match the reference VCF's naming.
bcftools view ${VCF_FILES} \
    | sed 's/chr//g' \
    | bgzip > tem.vcf.gz
bcftools index tem.vcf.gz

## Copy variant IDs over from the reference VCF (-c ID).
bcftools annotate -a ${REF_VCF} -c ID -o ${out_base}_anno.vcf.gz tem.vcf.gz
bcftools index ${out_base}_anno.vcf.gz

## Drop the intermediate files.
rm tem.vcf.gz tem.vcf.gz.csi

Convert VCF files to BED files

Code

set -eu

# Usage: <annotated_vcf>
VCF_FILES=$1  # e.g. "AMR-Axiom_UKB_WCSG_anno.vcf.gz"

# Output prefix = input name minus the "_anno.vcf.gz" suffix.
out_prefix=$(basename $VCF_FILES _anno.vcf.gz)

# Convert the VCF into PLINK binary format (.bed/.bim/.fam).
# --const-fid assigns a constant family ID so sample IDs are kept as-is.
plink --vcf ${VCF_FILES} \
      --const-fid \
      --make-bed \
      --threads 2 \
      --memory 128000 \
      --out ${out_prefix}

QC VCF files

Code

# Abort on unset variables and failing commands — every other script in
# this pipeline does this; it was previously missing here.
set -ue

PREFIX=$1  # PLINK fileset prefix, e.g. AMR-Axiom_UKB_WCSG

# Standard pre-PRS quality control:
#   --maf   drop very rare variants (MAF < 0.0001)
#   --hwe   drop variants failing Hardy-Weinberg equilibrium (p < 1e-6)
#   --geno  drop variants missing in > 1% of samples
#   --mind  drop samples with > 1% missing genotypes
# Only the surviving SNP list and .fam are written; the fileset itself
# is filtered afterwards in the dedup step.
plink     --bfile ${PREFIX}     \
          --maf 0.0001     \
          --hwe 1e-6     \
          --geno 0.01     \
          --mind 0.01     \
          --write-snplist     \
          --make-just-fam     \
          --memory 128000     \
          --out ${PREFIX}.QC

Deduplicate

Code

set -ue

PREFIX=$1  # e.g. AMR-Axiom_UKB_WCSG

## Produce the list of non-duplicated SNP IDs from the QC snplist.
Rscript LIST_NO_DUPLICATE.R ${PREFIX}.QC.snplist ${PREFIX}.nodup

## Rebuild the binary fileset, keeping only QC-passing samples
## (--keep) and non-duplicated variants (--extract).
plink --bfile ${PREFIX} \
      --keep ${PREFIX}.QC.fam \
      --extract ${PREFIX}.nodup \
      --make-bed \
      --threads 2 \
      --memory 128000 \
      --out ${PREFIX}.dedup

Get raw PRS score

Code

# Fail fast on unset variables / command errors (consistent with the
# other pipeline scripts; previously missing here).
set -ue

TARGET_FILE=$1      # target PLINK prefix, e.g. AMR-Axiom_UKB_WCSG.dedup
BASE_FILE=$2        # QCed GWAS summary statistics, e.g. GIANT_BMI.QCed.gz
OUT_PREFIX=$3       # output prefix, e.g. AMR-Axiom_UKB_WCSG_bmi
LD_TRUE=$4          # LD reference panel prefix, e.g. AMR-null.dedup

# Compute raw per-sample PRS sums at a fixed set of p-value thresholds
# (--fastscore with explicit --bar-levels). --no-regress skips phenotype
# model fitting, and --score sum outputs unstandardised score sums.
# The --a1/--a2/--bp/--chr/--pvalue/--snp/--stat flags map the base
# file's column names explicitly; --beta marks the effect size as a beta.
PRSice     --prsice /src/PRSice-2_v2.3.3/PRSice_linux     \
           --base ${BASE_FILE}     \
           --target ${TARGET_FILE}     \
           --ld ${LD_TRUE}     \
           --out ${OUT_PREFIX}     \
           --binary-target F     \
           --bar-levels 0.00000005,0.0000001,0.000001,0.00001,0.0001,0.001,0.01,0.1,0.2,0.3,0.5,1 \
           --fastscore     \
           --a1 A1     \
           --a2 A2     \
           --beta      \
           --bp BP     \
           --chr CHR     \
           --pvalue P     \
           --snp SNP     \
           --stat BETA     \
           --clump-kb 250kb     \
           --clump-p 1     \
           --clump-r2 0.1     \
           --ultra     \
           --no-regress     \
           --score sum     \
           --thread 1

Prepare percentile PRS scores

Code

# -----------------------------
# Load required packages
# -----------------------------
# library() stops immediately when a package is missing; require() only
# warns and returns FALSE, which would let the script fail later with a
# less obvious error.
library(data.table)
library(scales)
library(ggplot2)

# -----------------------------
# Define input/output directories
# -----------------------------
input_wgs_dir <- '/workspaces/lps_paper/evaluation/downstream/data/raw_prs_scores/'    # WGS PRS files
input_array_dir <- '/workspaces/lps_paper/evaluation/downstream/data/raw_prs_scores/'  # Array PRS files
output_dir <- '/workspaces/lps_paper/evaluation/downstream/data/process_prs_scores/'   # Processed results

# -----------------------------
# Define lists
# -----------------------------
# Phenotypes, populations, and PRSice p-value thresholds (these cutoff
# strings are column names in the .all_score files).
trait_list <- c('BMI', 'HEIGHT', 'DIABETES', 'METABOLIC')
pop_list <- c("AFR", "AMR", "EAS", "EUR", "SAS")
cutoff_list <- c("Pt_5e-08", "Pt_1e-07", "Pt_1e-06", "Pt_1e-05",
                 "Pt_0.0001", "Pt_0.001", "Pt_0.01", "Pt_0.1",
                 "Pt_0.2", "Pt_0.3", "Pt_0.5", "Pt_1")

# Genotyping arrays and low-pass sequencing depths compared against WGS.
array_list <- c('global-screening-array-v.3X',
                'Axiom_JAPONICAX', 'Axiom_UKB_WCSGX', 'Axiom_PMRAX',
                'Axiom_PMDAX', 'cytosnp-850k-v1.2X',
                'infinium-omni2.5.v1.5X', 'infinium-omni5-v1.2X',
                'LPS0.5X', 'LPS0.75X', 'LPS1.0X', 'LPS1.25X',
                'LPS1.5X', 'LPS2.0X')

# ----------------------------------------------------------
# Function: Compute percentile difference for WGS vs Array
# ----------------------------------------------------------
# Compare one array's PRS percentiles against WGS for a given population,
# trait, and p-value cutoff. `wgs` is the already-loaded WGS score table;
# the array table is read from `input_array_dir` (global). Returns one row
# per sample with WGS percentile, array percentile, and their absolute
# difference in percent.
get_percentile_dif <- function(wgs, array, pop, trait, cutoff ){
  cols = names(wgs)[-c(1,2)]  # Drop FID, IID; keep only score columns
  array_path = paste0(input_array_dir, array, "_", pop, "_", trait, ".all_score")
  arr = fread(array_path)

  # Extract PRS columns (assumes the array file has the same score
  # columns as the WGS file — TODO confirm for all array inputs)
  wgs2 = wgs[, ..cols]
  arr2 = arr[, ..cols]

  # Extract PRS values for the selected cutoff
  x = wgs2[[cutoff]]
  y = arr2[[cutoff]]

  # Compute ECDFs and convert raw scores to percentiles (fractions in [0,1])
  x1 = ecdf(x)
  y1 = ecdf(y)
  x = x1(x)  # WGS percentiles
  y = y1(y)  # Array percentiles

  # Absolute percentile difference, scaled to percent
  pct_dif = abs(x - y) * 100

  # NOTE(review): `y` is overwritten here, so the `arr_pct` column below
  # always reflects the 3rd column of the array file rather than the
  # selected cutoff, while `pct_dif` was computed from the cutoff column.
  # This looks intentional ("Optional" per the original author) but should
  # be confirmed.
  arr_percentile <- ecdf(arr[[3]])
  y = arr_percentile(arr[[3]])

  # Assemble the per-sample result, tagged with trait and array
  tem = data.frame(wgs_pct = x, arr_pct = y, pct_dif = pct_dif, trait = trait)
  tem$array = array
  return(tem)
}

# ----------------------------------------------------------
# Function: Compute percentile differences for all arrays for one trait
# ----------------------------------------------------------
# ----------------------------------------------------------
# Compute percentile differences against WGS for every array,
# for one population/trait/cutoff combination.
# ----------------------------------------------------------
get_percentile_trait <- function(pop, array_list, trait, cutoff){
  # Load the WGS PRS table once; it is reused for every array comparison.
  wgs <- fread(paste0(input_wgs_dir, "WGS_", pop, "_", trait, ".all_score"))

  # One result per array, kept in a named list so row names match the
  # original rbind behaviour.
  per_array <- lapply(array_list, function(arr) {
    get_percentile_dif(wgs, arr, pop, trait, cutoff)
  })
  names(per_array) <- array_list

  # Stack into a single data frame and tag with the population.
  combined <- as.data.frame(do.call(rbind, per_array))
  combined$pop <- pop
  return(combined)
}

# ----------------------------------------------------------
# Function: Compute percentile differences for all traits in one population
# ----------------------------------------------------------
# ----------------------------------------------------------
# Compute percentile differences for every trait within one
# population, stacking the per-trait results.
# ----------------------------------------------------------
get_percentile_pop <- function(pop, array_list, trait_list, cutoff){
  per_trait <- lapply(trait_list, function(tr) {
    get_percentile_trait(pop, array_list, tr, cutoff)
  })
  names(per_trait) <- trait_list

  stacked <- as.data.frame(do.call(rbind, per_trait))
  # Redundant with get_percentile_trait(), which already tags pop;
  # kept for exact parity with the original behaviour.
  stacked$pop <- pop
  return(stacked)
}

# ----------------------------------------------------------
# Function: Compute percentile differences for all populations
# ----------------------------------------------------------
# ----------------------------------------------------------
# Compute percentile differences for all populations.
#
# pop_list   : populations to process (e.g. "AFR", "EUR", ...)
# array_list : arrays / low-pass depths to compare against WGS
# cutoff     : PRSice p-value threshold column name (e.g. "Pt_0.01")
# traits     : phenotypes to process. Defaults to the global
#              `trait_list`, which the previous version read implicitly;
#              the explicit parameter is backward-compatible and matches
#              the sibling functions' signatures.
# Returns a single data frame stacked across populations.
# ----------------------------------------------------------
get_percentile_all <- function(pop_list, array_list, cutoff, traits = trait_list){
  res <- list()
  for (pop in pop_list) {
    res[[pop]] <- get_percentile_pop(pop, array_list, traits, cutoff)
  }
  as.data.frame(do.call(rbind, res))
}

# ----------------------------------------------------------
# Main loop: Process all cutoffs and save results
# ----------------------------------------------------------
# ----------------------------------------------------------
# Main loop: compute and save results for every p-value cutoff.
# Loop variable renamed from `c` — assigning `c` in the global
# environment shadows base::c and is a well-known R footgun.
# ----------------------------------------------------------
for (cutoff in cutoff_list) {
  message("Processing cutoff: ", cutoff)
  res <- get_percentile_all(pop_list, array_list, cutoff)
  out_path <- paste0(output_dir, cutoff, '.csv')
  fwrite(res, out_path, sep = ',', row.names = FALSE)
}

  1. Shing Wan Choi, Timothy Shin-Heng Mak, and Paul F O’Reilly. Tutorial: a guide to performing polygenic risk score analyses. Nature protocols, 15(9):2759–2772, 2020.