Skip to content

PRS processing

Requirements

  • Ubuntu 22.04 (8 CPUs, 32 GB)
  • bcftools (version==1.13)
  • plink v1.90
  • PRSice-2 v2.3.3

Input data

  • restructed lpWGS VCFs
  • restructed SNP-array VCFs
  • True VCFs
  • Base sumstats

The PRS processing scripts were developed with reference to the tutorial provided by Choi et al.1 , which served as a foundational guide.

PRS processing workflow

Correct sample name

Ensure that sample names do not contain underscores, as these may be introduced during the merging of imputed VCF files. In such cases, the filename used during merging may be incorporated into the sample name to maintain uniqueness across datasets.

Code

set -ue

VCF_FILES=$1

# Get correct name
bcftools query -l $VCF_FILES > sample_name.txt
sed -E 's/^(([^_]+)_.*)/\1\t\2/g' sample_name.txt > new_name.txt

# Check whether origin has correct name
col_num=`awk -F'\t' '{print NF}' new_name.txt | head -n 1`
checker=`if (( $col_num==1 )); then echo 1; else echo 0; fi`

file_name=$(basename $VCF_FILES .vcf.gz)

if (( $checker )); then
    echo "Create symlink"
    ln -s $VCF_FILES ${file_name}.vcf.gz
else
    # Replace old name by new one
    echo "Replace by new name"
    bcftools reheader -s new_name.txt -o ${file_name}_correct_name.vcf.gz $VCF_FILES
fi

bcftools index ${file_name}_correct_name.vcf.gz

Concatenate VCF files

Concatenate autosome VCF files have same prefix (Array name/ lowpass coverage).

Code

set -ue

PREFIX=$1     #PREFIX="AMR-Axiom_JAPONICA"
VCF_FOLDER=$2    #VCF_FOLDER="/path/to/vcf_files"

# List VCF_FILES
VCF_FILES=$(ls ${VCF_FOLDER}/${PREFIX}_chr*.vcf.gz)

bcftools concat -Oz -o ${PREFIX}_concat.vcf.gz ${VCF_FILES}
bcftools index ${PREFIX}_concat.vcf.gz

Annotate VCF files

Code

set -eu

VCF_FILES=$1  # VCF_FILES="AMR-Axiom_UKB_WCSG_concat.vcf.gz"
REF_VCF=$2   # REF_VCF="00-All.vcf.gz"

file_name=$(basename $VCF_FILES .vcf.gz)

## Make sure the format chromosome names is numeric
bcftools view ${VCF_FILES} |                 sed 's/chr//g' |                 bgzip > tem.vcf.gz
bcftools index tem.vcf.gz

## Annotate the VCF with reference VCF
bcftools annotate -a ${REF_VCF} -c ID -o ${file_name}_anno.vcf.gz tem.vcf.gz
bcftools index ${file_name}_anno.vcf.gz

## Remove temporary files
rm tem.vcf.gz tem.vcf.gz.csi

Convert VCF files to BED files

Code

set -eu

VCF_FILES=$1  # VCF_FILES="AMR-Axiom_UKB_WCSG_anno.vcf.gz"

prefix=$(basename $VCF_FILES _anno.vcf.gz)

plink   --vcf ${VCF_FILES} \
        --make-bed  \
        --const-fid \
        --out ${prefix}       \
        --threads 2       \
        --memory 128000

QC VCF files

Code

PREFIX=$1  # e.g. AMR-Axiom_UKB_WCSG

plink     --bfile ${PREFIX}     \
          --maf 0.0001     \
          --hwe 1e-6     \
          --geno 0.01     \
          --mind 0.01     \
          --write-snplist     \
          --make-just-fam     \
          --memory 128000     \
          --out ${PREFIX}.QC

Deduplicate

Code

set -ue

PREFIX=$1  # e.g. AMR-Axiom_UKB_WCSG

## List duplicated records
Rscript LIST_NO_DUPLICATE.R ${PREFIX}.QC.snplist ${PREFIX}.nodup

## Deduplicate
plink         --bfile ${PREFIX}         \
              --threads 2         \
              --make-bed         \
              --keep ${PREFIX}.QC.fam         \
              --out ${PREFIX}.dedup         \
              --extract ${PREFIX}.nodup         \
              --memory 128000

Get raw PRS score

Code

TARGET_FILE=$1      # e.g. AMR-Axiom_UKB_WCSG.dedup
BASE_FILE=$2        # e.g. GIANT_BMI.QCed.gz
OUT_PREFIX=$3       # e.g. AMR-Axiom_UKB_WCSG_bmi
LD_TRUE=$4          # e.g. AMR-null.dedup

PRSice     --prsice /src/PRSice-2_v2.3.3/PRSice_linux     \
           --base ${BASE_FILE}     \
           --target ${TARGET_FILE}     \
           --ld ${LD_TRUE}     \
           --out ${OUT_PREFIX}     \
           --binary-target F     \
           --bar-levels 0.00000005,0.0000001,0.000001,0.00001,0.0001,0.001,0.01,0.1,0.2,0.3,0.5,1 \
           --fastscore     \
           --a1 A1     \
           --a2 A2     \
           --beta      \
           --bp BP     \
           --chr CHR     \
           --pvalue P     \
           --snp SNP     \
           --stat BETA     \
           --clump-kb 250kb     \
           --clump-p 1     \
           --clump-r2 0.1     \
           --ultra     \
           --no-regress     \
           --score sum     \
           --thread 1

Prepare percentile PRS scores

Code

...

  1. Shing Wan Choi, Timothy Shin-Heng Mak, and Paul F O’Reilly. Tutorial: a guide to performing polygenic risk score analyses. Nature protocols, 15(9):2759–2772, 2020.