This project introduces a single C++ file as interface to the basic htslib
, which can be easily included in a C++ program
for scripting high-performance genomic analyses.
Features:
- single file to be easily included and compiled
- easy and safe API to use. no worry about free memory.
- has the full functionalities of the
htslib
, eg. supports of compressed VCF/BCF and URL link as filename. - compatible with C++11 and later
- download the released vcfpp.h and include it in your program.
- make sure you have https://github.com/samtools/htslib installed on your system and the it is in your environment.
You can copy paste the example code into a example.cpp
file and compile it by g++ example.cpp -std=c++11 -O2 -Wall -I. -lhts -lz -lm -lbz2 -llzma -lcurl
Let’s count the number of heterozygous sites for each sample in all records. The core is just 10 lines.
#include "vcfpp.h"
using namespace std;
using namespace vcfpp;
int main(int argc, char* argv[])
{
// fetch data from 1000 genomes server
BcfReader vcf("http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr22.filtered.SNV_INDEL_SV_phased_panel.vcf.gz");
BcfRecord v(vcf.header); // construct a variant record
vector<char> gt; // genotype can be of bool, char or int type
vector<int> hetsum(vcf.nsamples);
while (vcf.getNextVariant(v)) {
if (!v.isSNP()) continue; // skip other type of variants
v.getGenotypes(gt);
for (int i = 0; i < gt.size()/2 ; i++) { // for diploid
hetsum[i] += abs(gt[2 * i + 0] - gt[2 * i +1]);
}
}
for (auto i : hetsum) { cout << i << endl; }
return 0;
}
If you don’t bother using Eigen, another header only high performance linear algebra library, here is the more expressive way. Also, we’ll need the library to run PCA with VCF in another example.
#include "vcfpp.h"
#include <Eigen/Dense>
using namespace std;
using namespace vcfpp;
using namespace Eigen;
int main(int argc, char* argv[])
{
BcfReader vcf("http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr22.filtered.SNV_INDEL_SV_phased_panel.vcf.gz");
BcfRecord v(vcf.header);
vector<int> gt;
ArrayXi hetsum = ArrayXi::Zero(vcf.nsamples);
while (vcf.getNextVariant(v)) {
if (!v.isSNP()) continue; // skip other type of variants
v.getGenotypes(gt);
Map<const ArrayXXi> m(gt.data(), v.nploidy , gt.size() / v.nploidy); // read only
hetsum += (m.row(0) - m.row(1)).abs(); // for diploid
}
cout << hetsum << endl;
return 0;
}
We need the Eigen library to deal with linear algebra. To infer population structure using PCA, we first construct the genotype matrix (0,1,2) and standardize it under HWE assumption. Then we perform SVD on the standardized matrix to get the PCs.
#include "vcfpp.h"
#include <Eigen/Dense>
using namespace std;
using namespace vcfpp;
using namespace Eigen;
int main(int argc, char* argv[])
{
BcfReader vcf("http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr22.filtered.SNV_INDEL_SV_phased_panel.vcf.gz", "HG00096,HG00097,HG00099,HG00100,HG00101,HG00102,HG00103,HG00105,HG00106,HG00107", "chr22:19000000-20000000");
BcfRecord v(vcf.header);
int nsnps = 0, nsamples = vcf.nsamples;
vector<bool> gt;
vector<float> mat, gts(nsamples);
while (vcf.getNextVariant(v)) {
if (!v.isSNP()) continue; // skip other type of variants
v.getGenotypes(gt);
for (int i = 0; i < nsamples; i++) {
gts[i] = gt[2 * i] + gt[2 * i + 1];
}
mat.insert(mat.end(), gts.begin(), gts.end());
nsnps++;
}
Map<MatrixXf> M(mat.data(), nsamples, nsnps); // dim is nsamples x nsnsp
ArrayXf hwe = M.colwise().mean().array();
hwe = (hwe * (1 - hwe/2)).sqrt() ;
hwe = (hwe > 1e-9).select(hwe, 1); // in case denominator is smaller than 1e-9
M = (-M).rowwise() + M.colwise().mean(); // centering by subtracting the mean
M = (M.array().rowwise() / hwe.transpose()).matrix(); // standardize the matrix
JacobiSVD<MatrixXf> svd(M, Eigen::ComputeThinU | Eigen::ComputeThinV);
cout << svd.matrixU.leftCols(10) << endl; // save or print out top 10 PCs
cout << svd.singularValues().array().square() / nsnps << endl; // print out eigenvalues
return 0;
}
Here is a clean example of building Durbin’s PBWT structure using native C++. Depending on your project’s needs, you should be able to modify it to return other useful matrix, such as divergence matrix (d
) and FM-index matrix (u
, v
).
void pbwt() {
BcfReader vcf("http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr22.filtered.SNV_INDEL_SV_phased_panel.vcf.gz", "-", "chr22:16050075-16050655");
BcfRecord v(vcf.header);
vector<bool> gt;
vector<vector<bool>> x;
int nsnps = 0, nsamples = vcf.nsamples;
while (vcf.getNextVariant(v))
{
if (!v.isSNP()) continue; // skip other type of variants
v.getGenotypes(gt);x.push_back(gt);
nsnps++;
}
int N = nsnps, M = nsamples * 2;
vector<vector<int>> a(N, vector<int>(M)); // this is the index matrix to be retured
vector<int> a0(M), a1(M), d0(M), d1(M), d(M); // note: to output divergence matrix, make d as two dimensional N x M.
int i = 0, j = 0, u_ = 0, v_ = 0, p = 0, q = 0, d_, a_;
for (j = 0; j < N; j++) {
for (i = 0; i < M; i++) {
d_ = j > 0 ? d[i] : 0;
a_ = j > 0 ? a[j-1][i] : i;
p = max(p, d_); q = max(q, d_);
if (x[j][a_]) {
a1[v_] = a_; d1[v_] = q;
v_++; q = 0;
} else {
a0[u_] = a_; d0[u_] = p;
u_++; p = 0;
}
}
for (i = 0; i < M; i++) {
if (i < u_) {
a[j][i] = a0[i];
d[i] = d0[i];
} else {
a[j][i] = a1[i - u_];
d[i] = d1[i - u_];
}
}
} // return a
}