This is a short version of the tutorial using the ZFZR demo data set.
For more details check out the appropriate section of the detailed tutorial and the
documentation of the functions (?function_name)
library(vtamR)
All third-party tools are assumed to be in your PATH. If not, see the detailed tutorial on how to set the PATH.
See Input data for more details.
# Output directory for this demo run, plus the demo input files that
# ship with the vtamR package (located via system.file()).
outdir <- "vtamR_demo_out/zfzr_plate1"
fastq_dir <- system.file("extdata/demo/fastq", package = "vtamR")
fastqinfo <- system.file("extdata/demo/fastqinfo_zfzr_plate1.csv", package = "vtamR")
mock_ncbi_fasta <- system.file("extdata/demo/mock_ncbi.fasta", package = "vtamR")
asv_list <- system.file("extdata/demo/ASV_list_with_IDs.csv", package = "vtamR") # all ASV of MFZR
taxonomy <- system.file("extdata/db_test/taxonomy_reduced.tsv", package = "vtamR")
# BLAST database prefix: the db_test directory joined with the database name.
blast_db <- file.path(
  system.file("extdata/db_test", package = "vtamR"),
  "COInr_reduced"
)
See Merge Fastq Pairs and Quality Filter for more details.
# Merge the paired-end fastq files and apply quality filtering;
# see ?merge_fastq_pairs for the parameter semantics.
merged_dir <- file.path(outdir, "1_merged")
fastainfo_df <- merge_fastq_pairs(
  fastqinfo = fastqinfo,
  fastq_dir = fastq_dir,
  outdir = merged_dir,
  fastq_maxee = 1,
  fastq_maxns = 0,
  fastq_allowmergestagger = TRUE  # spelled out: T is reassignable, TRUE is not
)
See Randomly Select Sequences for more details.
# Randomly subsample each merged fasta file down to at most n reads to
# keep the demo fast; see ?subsample_fasta.
randomseq_dir <- file.path(outdir, "2_random_seq")
fastainfo_df <- subsample_fasta(
  fastainfo = fastainfo_df,
  fasta_dir = merged_dir,
  n = 40000,
  outdir = randomseq_dir
)
#> Warning: vtamR_demo_out/zfzr_plate1/1_merged/zfzr_3_fw.fasta contains 39603 sequences.
#> The input file is copied to output.
See Demultiplex and Trim for more details.
# Demultiplex reads and trim tags/primers with cutadapt; trimmed reads
# are kept only within the 150-165 bp length window (cutadapt -m/-M).
# See ?demultiplex_and_trim.
demultiplexed_dir <- file.path(outdir, "3_demultiplexed")
sampleinfo_df <- demultiplex_and_trim(
  fastainfo = fastainfo_df,
  fasta_dir = randomseq_dir,
  outdir = demultiplexed_dir,
  check_reverse = TRUE,
  cutadapt_minimum_length = 150,
  cutadapt_maximum_length = 165
)
See Dereplicate for more details.
# Dereplicate reads into ASVs; ASV identifiers are read from 'asv_list'
# and the updated list is written to 'updated_asv_list' — see ?dereplicate.
outfile <- file.path(outdir, "4_filter", "1_before_filter.csv")
updated_asv_list <- file.path(outdir, "ASV_list_with_IDs.csv")
read_count_df <- dereplicate(
  sampleinfo_df,
  dir = demultiplexed_dir,
  outfile = outfile,
  input_asv_list = asv_list,
  output_asv_list = updated_asv_list
)
See Filtering for more details.
See Initialize Statistics for more details.
# Zero-row statistics table; one row is appended after each pipeline
# stage so the effect of every filter can be reviewed at the end.
stat_df <- data.frame(
  parameters = character(0),
  asv_count = integer(0),
  read_count = integer(0),
  sample_count = integer(0),
  sample_replicate_count = integer(0)
)
stat_df <- get_stat(read_count_df, stat_df, stage = "input", params = NA)
See Denoising with SWARM for more details.
# Denoise ASVs with SWARM; see ?denoise_by_swarm for the exact
# semantics of by_sample and split_clusters.
by_sample <- TRUE
split_clusters <- TRUE
outfile <- file.path(outdir, "4_filter", "2_swarm_denoise.csv")
read_count_df <- denoise_by_swarm(
  read_count_df,
  outfile = outfile,
  by_sample = by_sample,
  split_clusters = split_clusters
)
# Record the parameters used alongside the post-stage counts.
param <- sprintf("by_sample: %s split_clusters: %s", by_sample, split_clusters)
stat_df <- get_stat(read_count_df, stat_df, stage = "swarm_denoise", params = param)
See Filter out ASVs with Low Total Read Count for more details.
# Remove ASVs with a very low total read count over the whole data set;
# see ?filter_asv_global for the exact cutoff semantics.
global_read_count_cutoff <- 2  # '<-' for assignment, not '='
outfile <- file.path(outdir, "4_filter", "3_filter_asv_global.csv")
read_count_df <- filter_asv_global(
  read_count_df,
  outfile = outfile,
  cutoff = global_read_count_cutoff
)
param <- paste("global_read_count_cutoff:", global_read_count_cutoff)
stat_df <- get_stat(read_count_df, stat_df, stage = "filter_asv_global", params = param)
See Filter ASVs Based on Read Length for more details.
# Filter out ASVs whose length is anomalous (likely indels) — see
# ?filter_indel.
outfile <- file.path(outdir, "4_filter", "4_filter_indel.csv")
read_count_df <- filter_indel(read_count_df, outfile = outfile)
stat_df <- get_stat(read_count_df, stat_df, stage = "filter_indel", params = NA)
See Filter ASVs with Stop Codons for more details.
# Filter out ASVs containing stop codons — see ?filter_stop_codon.
outfile <- file.path(outdir, "4_filter", "5_filter_stop_codon.csv")
read_count_df <- filter_stop_codon(read_count_df, outfile = outfile)
stat_df <- get_stat(read_count_df, stat_df, stage = "filter_stop_codon", params = NA)
See Filter Potential Contaminant ASVs for more details.
# Detect and remove potential contaminant ASVs; the candidates are
# written to 'conta_file' for inspection — see ?filter_contaminant.
outfile <- file.path(outdir, "4_filter", "6_filter_contaminant.csv")
conta_file <- file.path(outdir, "4_filter", "potential_contaminants.csv")
read_count_df <- filter_contaminant(
  read_count_df,
  sampleinfo = sampleinfo_df,
  conta_file = conta_file,
  outfile = outfile
)
stat_df <- get_stat(read_count_df, stat_df, stage = "filter_contaminant", params = NA)
See Filter Chimeric ASVs for more details.
# Filter out chimeric ASVs — see ?filter_chimera.
outfile <- file.path(outdir, "4_filter", "7_filter_chimera.csv")
read_count_df <- filter_chimera(read_count_df, outfile = outfile)
stat_df <- get_stat(read_count_df, stat_df, stage = "filter_chimera", params = NA)
See Filter Aberrant Replicates for more details.
# Filter aberrant replicates; the Renkonen-distance cutoff is taken at
# the given quantile of the observed distances — see ?filter_replicate.
outfile <- file.path(outdir, "4_filter", "8_filter_replicate.csv")
renkonen_distance_quantile <- 0.9
read_count_df <- filter_replicate(
  read_count_df,
  renkonen_distance_quantile = renkonen_distance_quantile,
  outfile = outfile
)
#> [1] "The cutoff value for Renkonen distances is 0.43204653622422"
param <- sprintf("renkonen_distance_quantile: %s", renkonen_distance_quantile)
stat_df <- get_stat(read_count_df, stat_df, stage = "filter_replicate", params = param)
See Create mock_composition File for more details.
# Match the retained ASVs against the mock reference sequences to build
# a mock-composition template; the file name makes clear that the
# template must be checked by hand before use — see
# ?match_variants_to_mock_species.
outdir_mock <- file.path(outdir, "mock_composition")
mock_template <- match_variants_to_mock_species(
  read_count = read_count_df,
  fas = mock_ncbi_fasta,
  taxonomy = taxonomy,
  sampleinfo = sampleinfo_df,
  outdir = outdir_mock
)
mock_composition <- file.path(outdir_mock, "mock_composition_template_to_check.csv")
mock_composition_df <- read.csv(mock_composition)
Note:
The mock_composition_template_to_check.csv file contains
only 5 of the 6 expected ASVs. Since the NCBI reference sequences were
correctly chosen, this indicates that the ZFZR primer pair did not
amplify Phoxinus phoxinus.
Because this sequence is absent from the
mock_composition, it will not be counted as a false
negative when evaluating performance. This is an artefact, and it should
be kept in mind that it is in fact a false negative.
See Create
mock_composition File for more details.
See Filter Low-Abundance Occurrences within Samples for more details.
Suggest parameter value
# Suggest per-sample cutoff values from the mock composition;
# see ?suggest_sample_cutoff.
outfile <- file.path(outdir, "4_filter", "suggest_parameters", "sample_cutoff.csv")  # '<-', not '='
suggest_sample_cutoff_df <- suggest_sample_cutoff(
  read_count = read_count_df,
  mock_composition = mock_composition,
  outfile = outfile
)
head(suggest_sample_cutoff_df)
#> sample replicate action asv_id sample_cutoff read_count
#> 1 tpos1 3 keep 9 0.0001 2
#> 2 tpos1 1 keep 9 0.0004 14
#> 3 tpos1 2 keep 9 0.0004 10
#> 4 tpos1 2 keep 10 0.0017 40
#> 5 tpos1 1 keep 10 0.0024 69
#> 6 tpos1 3 keep 10 0.0024 46
#> read_count_sample_replicate
#> 1 18727
#> 2 28405
#> 3 22864
#> 4 22864
#> 5 28405
#> 6 18727
#> asv
#> 1 TATCTGATCAGGTCTCGTAGGATCATCACTTAGATTTATTATTCGAATAGAATTAAGAACTCCTGGTAGATTTATTGGCAACGACCAAATTTATAACGTAATTGTTACATCTCATGCATTTATTATAATTTTTTTTATAGTTATACCAATCATAATT
#> 2 TATCTGATCAGGTCTCGTAGGATCATCACTTAGATTTATTATTCGAATAGAATTAAGAACTCCTGGTAGATTTATTGGCAACGACCAAATTTATAACGTAATTGTTACATCTCATGCATTTATTATAATTTTTTTTATAGTTATACCAATCATAATT
#> 3 TATCTGATCAGGTCTCGTAGGATCATCACTTAGATTTATTATTCGAATAGAATTAAGAACTCCTGGTAGATTTATTGGCAACGACCAAATTTATAACGTAATTGTTACATCTCATGCATTTATTATAATTTTTTTTATAGTTATACCAATCATAATT
#> 4 TGCTTGATCAGGGATAGTGGGAACTTCTTTAAGAATTCTTATTCGAGCTGAACTTGGTCATGCGGGATCTTTAATCGGAGACGATCAAATTTACAATGTAATTGTTACTGCACACGCCTTTGTAATAATTTTTTTTATAGTTATACCTATTTTAATT
#> 5 TGCTTGATCAGGGATAGTGGGAACTTCTTTAAGAATTCTTATTCGAGCTGAACTTGGTCATGCGGGATCTTTAATCGGAGACGATCAAATTTACAATGTAATTGTTACTGCACACGCCTTTGTAATAATTTTTTTTATAGTTATACCTATTTTAATT
#> 6 TGCTTGATCAGGGATAGTGGGAACTTCTTTAAGAATTCTTATTCGAGCTGAACTTGGTCATGCGGGATCTTTAATCGGAGACGATCAAATTTACAATGTAATTGTTACTGCACACGCCTTTGTAATAATTTTTTTTATAGTTATACCTATTTTAATT
#> taxon
#> 1 Hydropsyche pellucidula
#> 2 Hydropsyche pellucidula
#> 3 Hydropsyche pellucidula
#> 4 Synorthocladius semivirens
#> 5 Synorthocladius semivirens
#> 6 Synorthocladius semivirens
Apply filter
# Apply the chosen within-sample cutoff — see ?filter_occurrence_sample.
sample_cutoff <- 0.0001
outfile <- file.path(outdir, "4_filter", "9_filter_occurrence_sample.csv")
read_count_df <- filter_occurrence_sample(
  read_count_df,
  cutoff = sample_cutoff,
  outfile = outfile
)
param <- sprintf("cutoff: %s", sample_cutoff)
stat_df <- get_stat(read_count_df, stat_df, stage = "filter_occurrence_sample", params = param)
See Filter Occurrences by Replicate Consistency for more details.
# Keep only occurrences consistent across at least 'min_replicate_number'
# replicates — see ?filter_min_replicate.
min_replicate_number <- 2
outfile <- file.path(outdir, "4_filter", "10_filter_min_replicate.csv")
read_count_df <- filter_min_replicate(
  read_count_df,
  cutoff = min_replicate_number,
  outfile = outfile
)
param <- sprintf("cutoff: %s", min_replicate_number)
stat_df <- get_stat(read_count_df, stat_df, stage = "filter_min_replicate", params = param)
See Filter
Low-Abundance Occurrences (filter_occurrence_variant and
filter_occurrence_read_count) for more
details.
Suggest parameter values
# Explore combinations of read-count and variant cutoffs against the
# mock composition; the output reports false negatives (FN), true
# positives (TP) and false positives (FP) per combination —
# see ?suggest_variant_readcount_cutoffs.
outdir_suggest <- file.path(outdir, "4_filter", "suggest_parameters")  # '<-', not '='
suggest_variant_readcount_cutoffs_df <- suggest_variant_readcount_cutoffs(
  read_count_df,
  mock_composition = mock_composition,
  sampleinfo = sampleinfo_df,
  outdir = outdir_suggest,
  min_replicate_number = 2
)
head(suggest_variant_readcount_cutoffs_df)
#> read_count_cutoff variant_cutoff FN TP FP
#> 1 10 0.001 0 5 2
#> 2 10 0.002 0 5 2
#> 3 10 0.003 0 5 2
#> 4 10 0.004 0 5 2
#> 5 10 0.005 0 5 2
#> 6 10 0.006 0 5 2
Filter occurrences by variant
# Filter occurrences by variant cutoff — see ?filter_occurrence_variant.
variant_cutoff <- 0.001  # '<-' for assignment, not '='
outfile <- file.path(outdir, "4_filter", "11_filter_occurrence_variant.csv")
read_count_df_variant <- filter_occurrence_variant(
  read_count_df,
  cutoff = variant_cutoff,
  outfile = outfile
)
Filter occurrences by read count
# Filter occurrences by absolute read count —
# see ?filter_occurrence_read_count.
read_count_cutoff <- 10
outfile <- file.path(outdir, "4_filter", "12_filter_occurrence_read_count.csv")
read_count_df_rc <- filter_occurrence_read_count(
  read_count_df,
  cutoff = read_count_cutoff,
  outfile = outfile
)
Pool results
# Combine the results of the two occurrence filters — see ?pool_filters.
outfile <- file.path(outdir, "4_filter", "13_pool_filters.csv")
read_count_df <- pool_filters(
  read_count_df_rc,
  read_count_df_variant,
  outfile = outfile
)
# The two intermediate data frames are no longer needed.
rm(read_count_df_rc, read_count_df_variant)
See Filter Occurrences by Replicate Consistency for more details.
# Re-apply the replicate-consistency filter after pooling the two
# occurrence filters — see ?filter_min_replicate.
min_replicate_number <- 2
outfile <- file.path(outdir, "4_filter", "14_filter_min_replicate.csv")
read_count_df <- filter_min_replicate(
  read_count_df,
  cutoff = min_replicate_number,
  outfile = outfile
)
See Pool replicates for more details.
# Pool the replicates of each sample into per-sample read counts —
# see ?pool_replicates.
outfile <- file.path(outdir, "4_filter", "15_pool_replicates.csv")
read_count_sample_df <- pool_replicates(read_count_df, outfile = outfile)
See Performance Metrics for more details.
# Classify occurrences in the control samples against the (hand-checked)
# mock composition and write performance metrics —
# see ?classify_control_occurrences.
false_negatives <- file.path(outdir, "4_filter", "false_negatives.csv")
performance_metrics <- file.path(outdir, "4_filter", "performance_metrics.csv")
known_occurrences <- file.path(outdir, "4_filter", "known_occurrences.csv")
results <- classify_control_occurrences(
  read_count_sample_df,
  sampleinfo = sampleinfo_df,
  mock_composition = mock_composition,
  known_occurrences = known_occurrences,
  false_negatives = false_negatives,
  performance_metrics = performance_metrics
)
head(read.csv(performance_metrics))
#> TP FP FN Precision Sensitivity
#> 1 5 2 0 0.7142857 1
Note: Keep in mind that one expected sequence in the
mock is not present in the mock_composition. Therefore, the
true number of false negatives is 1, and the sensitivity is:
TP / (TP + FN) = 5 / (5 + 1) = 0.833
See Print Summary Statistics for more details.
write.csv(stat_df, file = file.path(outdir, "summary.csv"))
See Taxonomic Assignment for more details.
# Assign a taxon to each ASV via BLAST against the reduced COInr database.
outfile <- file.path(outdir, "4_filter", "assign_taxonomy_ltg.csv")
# NOTE(review): 'outfile' is assigned above but never passed to
# assign_taxonomy_ltg() — confirm whether an 'outfile' argument should be
# supplied here, or whether the assignment is a leftover.
asv_tax <- assign_taxonomy_ltg(
asv=read_count_sample_df,
taxonomy=taxonomy,
blast_db=blast_db
)
See Clustering (mOTUs) for more details.
# Cluster ASVs into mOTUs with SWARM (d = 7) — see ?cluster_asv.
# NOTE(review): the output file name says 'vsearch' but the method used is
# "swarm" — confirm which is intended and rename the file if appropriate.
outfile <- file.path(outdir, "5_cluster", "16_mOTU_vsearch.csv")
read_count_samples_df_ClusterSize <- cluster_asv(
read_count_sample_df,
method = "swarm",
swarm_d = 7,
outfile=outfile
)
See Output table for more details.
# Write the final ASV table combining read counts, taxonomic assignments
# and sample metadata — see ?write_asv_table.
outfile <- file.path(outdir, "motu_table_taxa.csv")  # '<-' for assignment, not '='
asv_table_df <- write_asv_table(
  read_count_sample_df,
  outfile = outfile,
  asv_tax = asv_tax,
  sampleinfo = sampleinfo_df
)