vcfpp: a single C++ file for manipulating VCF

This project introduces a single C++ file as interface to the basic htslib, which can be easily included in a C++ program for scripting high-performance genomic analyses.

Features:

single file to be easily included and compiled
easy and safe API to use. no worry about free memory.
has the full functionalities of the htslib, eg. supports of compressed VCF/BCF and URL link as filename.
compatible with C++11 and later

Installation

Examples

Calculate the heterozygosity rate

Infer population structure with PCA

Build PBWT index from haplotype data

Installation

download the released vcfpp.h and include it in your program.
make sure you have https://github.com/samtools/htslib installed on your system and the it is in your environment.

Examples

You can copy paste the example code into a example.cpp file and compile it by g++ example.cpp -std=c++11 -O2 -Wall -I. -lhts -lz -lm -lbz2 -llzma -lcurl

Calculate the heterozygosity rate

Let’s count the number of heterozygous sites for each sample in all records. The core is just 10 lines.

#include "vcfpp.h"
using namespace std;
using namespace vcfpp;

int main(int argc, char* argv[])
{
    // fetch data from 1000 genomes server
    BcfReader vcf("http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr22.filtered.SNV_INDEL_SV_phased_panel.vcf.gz");
    BcfRecord v(vcf.header); // construct a variant record
    vector<char> gt; // genotype can be of bool, char or int type
    vector<int> hetsum(vcf.nsamples);
    while (vcf.getNextVariant(v)) {
        if (!v.isSNP()) continue; // skip other type of variants
        v.getGenotypes(gt);
        for (int i = 0; i < gt.size()/2 ; i++) { // for diploid
            hetsum[i] += abs(gt[2 * i + 0] - gt[2 * i +1]);
        }
    }
    for (auto i : hetsum) { cout << i << endl; }
    return 0;
}

If you don’t bother using Eigen, another header only high performance linear algebra library, here is the more expressive way. Also, we’ll need the library to run PCA with VCF in another example.

#include "vcfpp.h"
#include <Eigen/Dense>
using namespace std;
using namespace vcfpp;
using namespace Eigen;

int main(int argc, char* argv[])
{
    BcfReader vcf("http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr22.filtered.SNV_INDEL_SV_phased_panel.vcf.gz");
    BcfRecord v(vcf.header);
    vector<int> gt;
    ArrayXi hetsum = ArrayXi::Zero(vcf.nsamples);
    while (vcf.getNextVariant(v)) {
        if (!v.isSNP()) continue; // skip other type of variants
        v.getGenotypes(gt);
        Map<const ArrayXXi> m(gt.data(), v.nploidy , gt.size() / v.nploidy); // read only
        hetsum += (m.row(0) - m.row(1)).abs(); // for diploid
    }
    cout << hetsum << endl;
    return 0;
}

Infer population structure with PCA

We need the Eigen library to deal with linear algebra. To infer population structure using PCA, we first construct the genotype matrix (0,1,2) and standardize it under HWE assumption. Then we perform SVD on the standardized matrix to get the PCs.

#include "vcfpp.h"
#include <Eigen/Dense>
using namespace std;
using namespace vcfpp;
using namespace Eigen;

int main(int argc, char* argv[])
{
    BcfReader vcf("http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr22.filtered.SNV_INDEL_SV_phased_panel.vcf.gz", "HG00096,HG00097,HG00099,HG00100,HG00101,HG00102,HG00103,HG00105,HG00106,HG00107", "chr22:19000000-20000000");
    BcfRecord v(vcf.header);
    int nsnps = 0, nsamples = vcf.nsamples;
    vector<bool> gt;
    vector<float> mat, gts(nsamples);
    while (vcf.getNextVariant(v)) {
        if (!v.isSNP()) continue; // skip other type of variants
        v.getGenotypes(gt);
        for (int i = 0; i < nsamples; i++) {
            gts[i] = gt[2 * i] + gt[2 * i + 1];
        }
        mat.insert(mat.end(), gts.begin(), gts.end());
        nsnps++;
    }
    Map<MatrixXf> M(mat.data(), nsamples, nsnps); // dim is nsamples x nsnsp
    ArrayXf hwe = M.colwise().mean().array();
    hwe = (hwe * (1 - hwe/2)).sqrt() ;
    hwe = (hwe > 1e-9).select(hwe, 1); // in case denominator is smaller than 1e-9
    M = (-M).rowwise() + M.colwise().mean(); // centering by subtracting the mean
    M = (M.array().rowwise() / hwe.transpose()).matrix(); // standardize the matrix
    JacobiSVD<MatrixXf> svd(M, Eigen::ComputeThinU | Eigen::ComputeThinV);
    cout << svd.matrixU.leftCols(10) << endl; // save or print out top 10 PCs
    cout << svd.singularValues().array().square() / nsnps << endl; // print out eigenvalues
    return 0;
}

Build PBWT index from haplotype data

Here is a clean example of building Durbin’s PBWT structure using native C++. Depending on your project’s needs, you should be able to modify it to return other useful matrix, such as divergence matrix (d) and FM-index matrix (u, v).

void pbwt() {
    BcfReader vcf("http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr22.filtered.SNV_INDEL_SV_phased_panel.vcf.gz", "-", "chr22:16050075-16050655");
    BcfRecord v(vcf.header);
    vector<bool> gt;
    vector<vector<bool>> x;
    int nsnps = 0, nsamples = vcf.nsamples;
    while (vcf.getNextVariant(v))
    {
        if (!v.isSNP()) continue; // skip other type of variants
        v.getGenotypes(gt);x.push_back(gt);
        nsnps++;
    }
    int N = nsnps, M = nsamples * 2;
    vector<vector<int>> a(N, vector<int>(M)); // this is the index matrix to be retured
    vector<int> a0(M), a1(M), d0(M), d1(M), d(M); // note: to output divergence matrix, make d as two dimensional N x M.
    int i = 0, j = 0, u_ = 0, v_ = 0, p = 0, q = 0, d_, a_;
    for (j = 0; j < N; j++) {
        for (i = 0; i < M; i++) {
            d_ = j > 0 ? d[i] : 0;
            a_ = j > 0 ? a[j-1][i] : i;
            p = max(p, d_); q = max(q, d_);
            if (x[j][a_]) {
                a1[v_] = a_; d1[v_] = q;
                v_++; q = 0;
            } else {
                a0[u_] = a_; d0[u_] = p;
                u_++; p = 0;
            }
        }
        for (i = 0; i < M; i++) {
            if (i < u_) {
                a[j][i] = a0[i];
                d[i] = d0[i];
            } else {
                a[j][i] = a1[i - u_];
                d[i] = d1[i - u_];
            }
        }
    } // return a
}

fuzzylife / vcfpp Goto Github PK

vcfpp's Introduction

vcfpp: a single C++ file for manipulating VCF

Table of Contents

Installation

Examples

Calculate the heterozygosity rate

Infer population structure with PCA

Build PBWT index from haplotype data

vcfpp's People

Contributors

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent

Jobs