60 个版本 (27 个破坏性版本)
新 0.28.0 | 2024年8月19日 |
---|---|
0.26.1 | 2024年8月7日 |
0.25.7 | 2024年6月18日 |
0.25.4 | 2024年3月8日 |
0.6.2 | 2023年7月17日 |
#29 in 生物学
每月下载量:746
用于 varfish-server-worker
615KB
13K SLoC
梅哈里
梅哈里是一个用于用变种效应/后果注释 VCF 文件的软件包。该程序使用 hgvs-rs 将基因组变种投影到转录本和蛋白质,从而具有高质量的预测。
其他提供变种效应/后果预测的流行工具包括
梅哈里提供的预测旨在与 VariantValidator 相似,VariantValidator 是 HGVS 变体描述的黄金标准。此外,它用 Rust 编程语言编写,可以用作用户 Rust 软件的库。
支持的序列变种频率数据库
梅哈里可以导入公共序列变种频率数据库。支持的集合在 GRCh37 和 GRCh38 之间略有不同。
GRCh37
- gnomAD r2.1.1 Exomes
gnomad.exomes.r2.1.1.sites.vcf.bgz
- gnomAD r2.1.1 Genomes
gnomad.genomes.r2.1.1.sites.vcf.bgz
- gnomAD v3.1 mtDNA
gnomad.genomes.v3.1.sites.chrM.vcf.bgz
- HelixMTdb
HelixMTdb_20200327.tsv
GRCh38
- gnomAD r2.1.1 外显子组提升覆盖
gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz
- gnomAD v3.1 基因组
gnomad.genomes.v3.1.2.sites.$CHROM.vcf.bgz
- gnomAD v3.1 mtDNA
gnomad.genomes.v3.1.sites.chrM.vcf.bgz
- HelixMTdb
HelixMTdb_20200327.tsv
内部笔记
rm -rf /tmp/out ; cargo run -- db create seqvar-freqs --path-output-db /tmp/out --genome-release grch38 --path-helix-mtdb ~/Downloads/HelixMTdb_20200327.vcf.gz --path-gnomad-mtdna ~/Downloads/gnomad.genomes.v3.1.sites.chrM.vcf.bgz --path-gnomad-exomes-xy tests/data/db/create/seqvar_freqs/xy-38/gnomad.exomes.r2.1.1.sites.chrX.vcf --path-gnomad-exomes-xy tests/data/db/create/seqvar_freqs/xy-38/gnomad.exomes.r2.1.1.sites.chrY.vcf --path-gnomad-genomes-xy tests/data/db/create/seqvar_freqs/xy-38/gnomad.genomes.r3.1.1.sites.chrX.vcf --path-gnomad-genomes-xy tests/data/db/create/seqvar_freqs/xy-38/gnomad.genomes.r3.1.1.sites.chrY.vcf --path-gnomad-exomes-auto tests/data/db/create/seqvar_freqs/12-38/gnomad.exomes.r2.1.1.sites.chr1.vcf --path-gnomad-exomes-auto tests/data/db/create/seqvar_freqs/12-38/gnomad.exomes.r2.1.1.sites.chr2.vcf --path-gnomad-genomes-auto tests/data/db/create/seqvar_freqs/12-38/gnomad.genomes.r3.1.1.sites.chr1.vcf --path-gnomad-genomes-auto tests/data/db/create/seqvar_freqs/12-38/gnomad.genomes.r3.1.1.sites.chr2.vcf
rm -rf /tmp/out ; cargo run -- db create seqvar-freqs --path-output-db /tmp/out --genome-release grch37 --path-gnomad-mtdna ~/Downloads/gnomad.genomes.v3.1.sites.chrM.vcf.bgz --path-gnomad-exomes-xy tests/data/db/create/seqvar_freqs/xy-37/gnomad.exomes.r2.1.1.sites.chrX.vcf --path-gnomad-exomes-xy tests/data/db/create/seqvar_freqs/xy-37/gnomad.exomes.r2.1.1.sites.chrY.vcf --path-gnomad-genomes-xy tests/data/db/create/seqvar_freqs/xy-37/gnomad.genomes.r2.1.1.sites.chrX.vcf --path-gnomad-exomes-auto tests/data/db/create/seqvar_freqs/12-37/gnomad.exomes.r2.1.1.sites.chr1.vcf --path-gnomad-exomes-auto tests/data/db/create/seqvar_freqs/12-37/gnomad.exomes.r2.1.1.sites.chr2.vcf --path-gnomad-genomes-auto tests/data/db/create/seqvar_freqs/12-37/gnomad.genomes.r2.1.1.sites.chr1.vcf --path-gnomad-genomes-auto tests/data/db/create/seqvar_freqs/12-37/gnomad.genomes.r2.1.1.sites.chr2
prepare()
{
in=$1
out=$2
zcat $in \
| head -n 5000 \
| grep ^# \
> $out
zcat $in \
| grep -v ^# \
| head -n 3 \
>> $out
}
base=/data/sshfs/data/gpfs-1/groups/cubi/work/projects/2021-07-20_varfish-db-downloader-holtgrewe/varfish-db-downloader/
mkdir -p tests/data/db/create/seqvar_freqs/{12,xy}-{37,38}
## 37 exomes
prepare \
$base/GRCh37/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chr1.vcf.bgz \
tests/data/db/create/seqvar_freqs/12-37/gnomad.exomes.r2.1.1.sites.chr1.vcf
prepare \
$base/GRCh37/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chr2.vcf.bgz \
tests/data/db/create/seqvar_freqs/12-37/gnomad.exomes.r2.1.1.sites.chr2.vcf
prepare \
$base/GRCh37/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chrX.vcf.bgz \
tests/data/db/create/seqvar_freqs/xy-37/gnomad.exomes.r2.1.1.sites.chrX.vcf
prepare \
$base/GRCh37/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chrY.vcf.bgz \
tests/data/db/create/seqvar_freqs/xy-37/gnomad.exomes.r2.1.1.sites.chrY.vcf
## 37 genomes
prepare \
$base/GRCh37/gnomAD_genomes/r2.1.1/download/gnomad.genomes.r2.1.1.sites.chr1.vcf.bgz \
tests/data/db/create/seqvar_freqs/12-37/gnomad.genomes.r2.1.1.sites.chr1.vcf
prepare \
$base/GRCh37/gnomAD_genomes/r2.1.1/download/gnomad.genomes.r2.1.1.sites.chr2.vcf.bgz \
tests/data/db/create/seqvar_freqs/12-37/gnomad.genomes.r2.1.1.sites.chr2.vcf
prepare \
$base/GRCh37/gnomAD_genomes/r2.1.1/download/gnomad.genomes.r2.1.1.sites.chrX.vcf.bgz \
tests/data/db/create/seqvar_freqs/xy-37/gnomad.genomes.r2.1.1.sites.chrX.vcf
## 38 exomes
prepare \
$base/GRCh38/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chr1.vcf.bgz \
tests/data/db/create/seqvar_freqs/12-38/gnomad.exomes.r2.1.1.sites.chr1.vcf
prepare \
$base/GRCh38/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chr2.vcf.bgz \
tests/data/db/create/seqvar_freqs/12-38/gnomad.exomes.r2.1.1.sites.chr2.vcf
prepare \
$base/GRCh38/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chrX.vcf.bgz \
tests/data/db/create/seqvar_freqs/xy-38/gnomad.exomes.r2.1.1.sites.chrX.vcf
prepare \
$base/GRCh38/gnomAD_exomes/r2.1.1/download/gnomad.exomes.r2.1.1.sites.chrY.vcf.bgz \
tests/data/db/create/seqvar_freqs/xy-38/gnomad.exomes.r2.1.1.sites.chrY.vcf
## 38 genomes
prepare \
$base/GRCh38/gnomAD_genomes/r3.1.1/download/gnomad.genomes.r3.1.1.sites.chr1.vcf.bgz \
tests/data/db/create/seqvar_freqs/12-38/gnomad.genomes.r3.1.1.sites.chr1.vcf
prepare \
$base/GRCh38/gnomAD_genomes/r3.1.1/download/gnomad.genomes.r3.1.1.sites.chr2.vcf.bgz \
tests/data/db/create/seqvar_freqs/12-38/gnomad.genomes.r3.1.1.sites.chr2.vcf
prepare \
$base/GRCh38/gnomAD_genomes/r3.1.1/download/gnomad.genomes.r3.1.1.sites.chrX.vcf.bgz \
tests/data/db/create/seqvar_freqs/xy-38/gnomad.genomes.r3.1.1.sites.chrX.vcf
prepare \
$base/GRCh38/gnomAD_genomes/r3.1.1/download/gnomad.genomes.r3.1.1.sites.chrY.vcf.bgz \
tests/data/db/create/seqvar_freqs/xy-38/gnomad.genomes.r3.1.1.sites.chrY.vcf
构建 tx 数据库
cd hgvs-rs-data
seqrepo --root-directory seqrepo-data/master init
mkdir -p mirror/ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot
cd !$
wget https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.files.installed
parallel -j 16 'wget https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/{}' ::: $(cut -f 2 human.files.installed | grep fna)
cd -
mkdir -p mirror/ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/cdna
cd !$
wget https://ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
cd -
mkdir -p mirror/ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/ncrna
cd !$
wget https://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz
cd -
mkdir -p mirror/ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/cdna/
cd !$
wget https://ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh37.cdna.all.fa.gz
cd -
mkdir -p mirror/ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/ncrna/
cd !$
wget https://ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/ncrna/Homo_sapiens.GRCh37.ncrna.fa.gz
cd -
seqrepo --root-directory seqrepo-data/master load -n NCBI $(find mirror/ftp.ncbi.nih.gov -name '*.fna.gz' | sort)
seqrepo --root-directory seqrepo-data/master load -n ENSEMBL $(find mirror/ftp.ensembl.org -name '*.fa.gz' | sort)
cd ../mehari
cargo run --release -- \
-v \
db create txs \
--path-out /tmp/txs-out.bin.zst \
--path-lable-tsv PATH_TO_MANE_LABEL.tsv \
--path-cdot-json ../cdot-0.2.21.ensembl.grch37_grch38.json.gz \
--path-cdot-json ../cdot-0.2.21.refseq.grch37_grch38.json.gz \
--path-seqrepo-instance ../hgvs-rs-data/seqrepo-data/master/master
开发设置
您需要 protoc 的最新版本,例如。
# bash utils/install-protoc.sh
# export PATH=$PATH:$HOME/.local/share/protoc/bin
依赖项
~95MB
~2M SLoC