Short Tutorial

This is a short version of the tutorial using the ZFZR demo data set. For more details, check out the appropriate section of the detailed tutorial and the documentation of the individual functions (`?function_name`).

Set up

Load library

library(vtamR)

Set paths (if needed)

This tutorial assumes that all third-party tools are in your PATH. If not, see the detailed tutorial on how to set the PATH.

Input data

See Input data for more details.

Demo data

# Output directory for this tutorial run
outdir <- "vtamR_demo_out/zfzr_plate1"

# Demo input files shipped with the vtamR package
fastq_dir       <- system.file("extdata/demo/fastq", package = "vtamR")
fastqinfo       <- system.file("extdata/demo/fastqinfo_zfzr_plate1.csv", package = "vtamR")
mock_ncbi_fasta <- system.file("extdata/demo/mock_ncbi.fasta", package = "vtamR")
asv_list        <- system.file("extdata/demo/ASV_list_with_IDs.csv", package = "vtamR") # all ASVs of MFZR

Database

# Reference taxonomy table and BLAST database bundled with the package
taxonomy <- system.file("extdata/db_test/taxonomy_reduced.tsv", package = "vtamR")
blast_db <- file.path(system.file("extdata/db_test", package = "vtamR"), "COInr_reduced")

Pre-processing

Merge Fastq Pairs

See Merge Fastq Pairs and Quality Filter for more details.

merged_dir <- file.path(outdir, "1_merged")

# Merge paired-end FASTQ files and apply quality filters.
# fastq_maxee: maximum expected errors allowed per merged read;
# fastq_maxns: maximum number of N bases allowed.
fastainfo_df <- merge_fastq_pairs(
  fastqinfo = fastqinfo,
  fastq_dir = fastq_dir,
  outdir = merged_dir,
  fastq_maxee = 1,
  fastq_maxns = 0,
  fastq_allowmergestagger = TRUE  # spell out TRUE; T is a reassignable variable
)

Subsampling

See Randomly Select Sequences for more details.

randomseq_dir <- file.path(outdir, "2_random_seq")

# Randomly subsample each FASTA file down to at most n sequences;
# files that already contain fewer sequences are copied as-is.
fastainfo_df <- subsample_fasta(
  n = 40000,
  fastainfo = fastainfo_df,
  fasta_dir = merged_dir,
  outdir = randomseq_dir
)
#> Warning: vtamR_demo_out/zfzr_plate1/1_merged/zfzr_3_fw.fasta contains 39603 sequences.
#> The input file is copied to output.

Demultiplex

See Demultiplex and Trim for more details.

demultiplexed_dir <- file.path(outdir, "3_demultiplexed")

# Demultiplex reads into sample-replicates, trim, and keep only reads
# within the expected length range (see Demultiplex and Trim).
sampleinfo_df <- demultiplex_and_trim(
  fastainfo = fastainfo_df,
  fasta_dir = randomseq_dir,
  outdir = demultiplexed_dir,
  check_reverse = TRUE,
  cutadapt_minimum_length = 150,
  cutadapt_maximum_length = 165
)

Dereplicate

See Dereplicate for more details.

outfile <- file.path(outdir, "4_filter", "1_before_filter.csv")
updated_asv_list <- file.path(outdir, "ASV_list_with_IDs.csv")

# Dereplicate reads into ASVs with per sample-replicate read counts;
# existing ASV IDs are taken from input_asv_list and the updated list
# is written to output_asv_list.
read_count_df <- dereplicate(
  sampleinfo_df,
  dir = demultiplexed_dir,
  outfile = outfile,
  input_asv_list = asv_list,
  output_asv_list = updated_asv_list
)

Filtering

See Filtering for more details.

Initialize Statistics

See Initialize Statistics for more details.

# Empty statistics table; get_stat() appends one row per filtering stage.
stat_df <- data.frame(
  parameters             = character(0),
  asv_count              = integer(0),
  read_count             = integer(0),
  sample_count           = integer(0),
  sample_replicate_count = integer(0)
)

# Record baseline statistics before any filtering is applied
stat_df <- get_stat(read_count_df, stat_df, stage = "input", params = NA)

Filtering ASVs

Denoising

See Denoising with SWARM for more details.

# Denoising options: work within each sample, and split clusters
# (see Denoising with SWARM).
by_sample <- TRUE
split_clusters <- TRUE

outfile <- file.path(outdir, "4_filter", "2_swarm_denoise.csv")
read_count_df <- denoise_by_swarm(
  read_count_df,
  outfile = outfile,
  by_sample = by_sample,
  split_clusters = split_clusters
)

param <- paste("by_sample:", by_sample, "split_clusters:", split_clusters)
stat_df <- get_stat(read_count_df, stat_df, stage = "swarm_denoise", params = param)

Filter ASVs Global

See Filter out ASVs with Low Total Read Count for more details.

# Drop ASVs whose total read count over the whole data set does not
# exceed the cutoff (see Filter out ASVs with Low Total Read Count).
global_read_count_cutoff <- 2  # use <- for assignment, not =
outfile <- file.path(outdir, "4_filter", "3_filter_asv_global.csv")

read_count_df <- filter_asv_global(
  read_count_df,
  outfile = outfile,
  cutoff = global_read_count_cutoff
)

param <- paste("global_read_count_cutoff:", global_read_count_cutoff)
stat_df <- get_stat(read_count_df, stat_df, stage = "filter_asv_global", params = param)

Filter ASVs Read Length

See Filter ASVs Based on Read Length for more details.

outfile <- file.path(outdir, "4_filter", "4_filter_indel.csv")

# Remove ASVs with unexpected read lengths
# (see Filter ASVs Based on Read Length).
read_count_df <- filter_indel(read_count_df, outfile = outfile)
stat_df <- get_stat(read_count_df, stat_df, stage = "filter_indel", params = NA)

Filter ASVs Stop Codons

See Filter ASVs with Stop Codons for more details.

outfile <- file.path(outdir, "4_filter", "5_filter_stop_codon.csv")

# Remove ASVs containing stop codons (see Filter ASVs with Stop Codons)
read_count_df <- filter_stop_codon(read_count_df, outfile = outfile)

stat_df <- get_stat(read_count_df, stat_df, stage = "filter_stop_codon", params = NA)

Filter Contaminant ASVs

See Filter Potential Contaminant ASVs for more details.

outfile <- file.path(outdir, "4_filter", "6_filter_contaminant.csv")
conta_file <- file.path(outdir, "4_filter", "potential_contaminants.csv")

# Remove potential contaminant ASVs; the candidates are written to
# conta_file for inspection.
read_count_df <- filter_contaminant(
  read_count_df,
  sampleinfo = sampleinfo_df,
  outfile = outfile,
  conta_file = conta_file
)

stat_df <- get_stat(read_count_df, stat_df, stage = "filter_contaminant", params = NA)

Filter Chimeric ASVs

See Filter Chimeric ASVs for more details.

outfile <- file.path(outdir, "4_filter", "7_filter_chimera.csv")

# Remove chimeric ASVs (see Filter Chimeric ASVs)
read_count_df <- filter_chimera(read_count_df, outfile = outfile)

stat_df <- get_stat(read_count_df, stat_df, stage = "filter_chimera", params = NA)

Filter Replicates

See Filter Aberrant Replicates for more details.

outfile <- file.path(outdir, "4_filter", "8_filter_replicate.csv")
renkonen_distance_quantile <- 0.9

# Discard aberrant replicates: the cutoff on Renkonen distances is
# derived from the given quantile (see Filter Aberrant Replicates).
read_count_df <- filter_replicate(
  read_count_df,
  outfile = outfile,
  renkonen_distance_quantile = renkonen_distance_quantile
)
#> [1] "The cutoff value for Renkonen distances is  0.43204653622422"

param <- paste("renkonen_distance_quantile:", renkonen_distance_quantile)
stat_df <- get_stat(read_count_df, stat_df, stage = "filter_replicate", params = param)

Filtering Occurrences

Create mock_composition File

See Create mock_composition File for more details.

outdir_mock <- file.path(outdir, "mock_composition")

# Build a mock_composition template by matching the retained ASVs to
# the mock species reference sequences.
mock_template <- match_variants_to_mock_species(
  read_count = read_count_df,
  fas = mock_ncbi_fasta,
  taxonomy = taxonomy,
  sampleinfo = sampleinfo_df,
  outdir = outdir_mock
)

# Read the template back in; it should be checked manually before use.
mock_composition <- file.path(outdir_mock, "mock_composition_template_to_check.csv")
mock_composition_df <- read.csv(mock_composition)

Note:

The mock_composition_template_to_check.csv file contains only 5 of the 6 expected ASVs. Since the NCBI reference sequences were correctly chosen, this indicates that the ZFZR primer pair did not amplify Phoxinus phoxinus.

Because this sequence is absent from the mock_composition, it will not be counted as a false negative when evaluating performance. This is an artefact, and it should be kept in mind that it is in fact a false negative.

See Create mock_composition File for more details.

Filter Occurrences by Samples

See Filter Low-Abundance Occurrences within Samples for more details.

Suggest parameter value

# Suggest a per-sample cutoff from the mock samples.
outfile <- file.path(outdir, "4_filter", "suggest_parameters", "sample_cutoff.csv")  # use <- for assignment, not =

suggest_sample_cutoff_df <- suggest_sample_cutoff(
  read_count = read_count_df,
  mock_composition = mock_composition,
  outfile = outfile
)

head(suggest_sample_cutoff_df)
#>   sample replicate action asv_id sample_cutoff read_count
#> 1  tpos1         3   keep      9        0.0001          2
#> 2  tpos1         1   keep      9        0.0004         14
#> 3  tpos1         2   keep      9        0.0004         10
#> 4  tpos1         2   keep     10        0.0017         40
#> 5  tpos1         1   keep     10        0.0024         69
#> 6  tpos1         3   keep     10        0.0024         46
#>   read_count_sample_replicate
#> 1                       18727
#> 2                       28405
#> 3                       22864
#> 4                       22864
#> 5                       28405
#> 6                       18727
#>                                                                                                                                                             asv
#> 1 TATCTGATCAGGTCTCGTAGGATCATCACTTAGATTTATTATTCGAATAGAATTAAGAACTCCTGGTAGATTTATTGGCAACGACCAAATTTATAACGTAATTGTTACATCTCATGCATTTATTATAATTTTTTTTATAGTTATACCAATCATAATT
#> 2 TATCTGATCAGGTCTCGTAGGATCATCACTTAGATTTATTATTCGAATAGAATTAAGAACTCCTGGTAGATTTATTGGCAACGACCAAATTTATAACGTAATTGTTACATCTCATGCATTTATTATAATTTTTTTTATAGTTATACCAATCATAATT
#> 3 TATCTGATCAGGTCTCGTAGGATCATCACTTAGATTTATTATTCGAATAGAATTAAGAACTCCTGGTAGATTTATTGGCAACGACCAAATTTATAACGTAATTGTTACATCTCATGCATTTATTATAATTTTTTTTATAGTTATACCAATCATAATT
#> 4 TGCTTGATCAGGGATAGTGGGAACTTCTTTAAGAATTCTTATTCGAGCTGAACTTGGTCATGCGGGATCTTTAATCGGAGACGATCAAATTTACAATGTAATTGTTACTGCACACGCCTTTGTAATAATTTTTTTTATAGTTATACCTATTTTAATT
#> 5 TGCTTGATCAGGGATAGTGGGAACTTCTTTAAGAATTCTTATTCGAGCTGAACTTGGTCATGCGGGATCTTTAATCGGAGACGATCAAATTTACAATGTAATTGTTACTGCACACGCCTTTGTAATAATTTTTTTTATAGTTATACCTATTTTAATT
#> 6 TGCTTGATCAGGGATAGTGGGAACTTCTTTAAGAATTCTTATTCGAGCTGAACTTGGTCATGCGGGATCTTTAATCGGAGACGATCAAATTTACAATGTAATTGTTACTGCACACGCCTTTGTAATAATTTTTTTTATAGTTATACCTATTTTAATT
#>                        taxon
#> 1    Hydropsyche pellucidula
#> 2    Hydropsyche pellucidula
#> 3    Hydropsyche pellucidula
#> 4 Synorthocladius semivirens
#> 5 Synorthocladius semivirens
#> 6 Synorthocladius semivirens

Apply filter

sample_cutoff <- 0.0001
outfile <- file.path(outdir, "4_filter", "9_filter_occurrence_sample.csv")

# Filter low-abundance occurrences within samples using the cutoff
# suggested above.
read_count_df <- filter_occurrence_sample(
  read_count_df,
  cutoff = sample_cutoff,
  outfile = outfile
)

param <- paste("cutoff:", sample_cutoff)
stat_df <- get_stat(read_count_df, stat_df, stage = "filter_occurrence_sample", params = param)

Filter Occurrences by Replicate

See Filter Occurrences by Replicate Consistency for more details.

min_replicate_number <- 2
outfile <- file.path(outdir, "4_filter", "10_filter_min_replicate.csv")

# Keep occurrences supported by at least min_replicate_number replicates
# (see Filter Occurrences by Replicate Consistency).
read_count_df <- filter_min_replicate(
  read_count_df,
  cutoff = min_replicate_number,
  outfile = outfile
)

param <- paste("cutoff:", min_replicate_number)
stat_df <- get_stat(read_count_df, stat_df, stage = "filter_min_replicate", params = param)

Filter Occurrences by Variant and Read count

See Filter Low-Abundance Occurrences (filter_occurrence_variant and filter_occurrence_read_count) for more details.

Suggest parameter values

# Explore cutoff combinations and their effect on FN/TP/FP counts.
outdir_suggest <- file.path(outdir, "4_filter", "suggest_parameters")  # use <- for assignment, not =

suggest_variant_readcount_cutoffs_df <- suggest_variant_readcount_cutoffs(
  read_count_df,
  mock_composition = mock_composition,
  sampleinfo = sampleinfo_df,
  outdir = outdir_suggest,
  min_replicate_number = 2
)

head(suggest_variant_readcount_cutoffs_df)
#>   read_count_cutoff variant_cutoff FN TP FP
#> 1                10          0.001  0  5  2
#> 2                10          0.002  0  5  2
#> 3                10          0.003  0  5  2
#> 4                10          0.004  0  5  2
#> 5                10          0.005  0  5  2
#> 6                10          0.006  0  5  2

Filter occurrences by variant

variant_cutoff <- 0.001  # use <- for assignment, not =
outfile <- file.path(outdir, "4_filter", "11_filter_occurrence_variant.csv")

# Filter low-abundance occurrences per variant; the result is pooled
# with the read-count filter below.
read_count_df_variant <- filter_occurrence_variant(
  read_count_df,
  cutoff = variant_cutoff,
  outfile = outfile
)

Filter occurrences by read count

read_count_cutoff <- 10
outfile <- file.path(outdir, "4_filter", "12_filter_occurrence_read_count.csv")

# Filter low-abundance occurrences by absolute read count; the result
# is pooled with the variant filter below.
read_count_df_rc <- filter_occurrence_read_count(
  read_count_df,
  cutoff = read_count_cutoff,
  outfile = outfile
)

Pool results

outfile <- file.path(outdir, "4_filter", "13_pool_filters.csv")

# Pool the results of the two occurrence filters, then discard the
# intermediate data frames.
read_count_df <- pool_filters(
  read_count_df_rc,
  read_count_df_variant,
  outfile = outfile
)

rm(read_count_df_rc, read_count_df_variant)

Filter Occurrences by Replicate Again

See Filter Occurrences by Replicate Consistency for more details.

min_replicate_number <- 2
outfile <- file.path(outdir, "4_filter", "14_filter_min_replicate.csv")

# Re-apply the replicate-consistency filter after pooling the
# occurrence filters.
read_count_df <- filter_min_replicate(
  read_count_df,
  cutoff = min_replicate_number,
  outfile = outfile
)

Final steps

Pool replicates

See Pool replicates for more details.

outfile <- file.path(outdir, "4_filter", "15_pool_replicates.csv")

# Pool replicates to obtain one read count per sample
read_count_sample_df <- pool_replicates(read_count_df, outfile = outfile)

Performance Metrics

See Performance Metrics for more details.

false_negatives <- file.path(outdir, "4_filter", "false_negatives.csv")
performance_metrics <- file.path(outdir, "4_filter", "performance_metrics.csv")
known_occurrences <- file.path(outdir, "4_filter", "known_occurrences.csv")

# Classify occurrences in control samples against the expected mock
# composition and write the performance metrics to file.
results <- classify_control_occurrences(
  read_count_sample_df,
  sampleinfo = sampleinfo_df,
  mock_composition = mock_composition,
  known_occurrences = known_occurrences,
  false_negatives = false_negatives,
  performance_metrics = performance_metrics
)

head(read.csv(performance_metrics))
#>   TP FP FN Precision Sensitivity
#> 1  5  2  0 0.7142857           1

Note: Keep in mind that one expected sequence in the mock is not present in the mock_composition. Therefore, the true number of false negatives is 1, and the sensitivity is:

TP / (TP + FN) = 5 / (5 + 1) = 0.833

Taxonomic Assignment

See Taxonomic Assignment for more details.

outfile <- file.path(outdir, "4_filter", "assign_taxonomy_ltg.csv")
# NOTE(review): outfile is defined above but never passed to
# assign_taxonomy_ltg — confirm whether the function accepts an
# output-file argument, or drop the assignment.

# Assign a taxon to each ASV using the reference taxonomy and BLAST
# database (see Taxonomic Assignment).
asv_tax <- assign_taxonomy_ltg(
  asv = read_count_sample_df,
  taxonomy = taxonomy,
  blast_db = blast_db
)

Clustering (mOTUs)

See Clustering (mOTUs) for more details.

outfile <- file.path(outdir, "5_cluster", "16_mOTU_vsearch.csv")
# NOTE(review): the file name mentions vsearch, but method = "swarm"
# is used below — confirm which is intended.

# Cluster ASVs into mOTUs
read_count_samples_df_ClusterSize <- cluster_asv(
  read_count_sample_df,
  method = "swarm",
  swarm_d = 7,
  outfile = outfile
)

Output table

See Output table for more details.

outfile <- file.path(outdir, "motu_table_taxa.csv")  # use <- for assignment, not =

# Write the final output table combining the per-sample read counts
# with the taxonomic assignments.
asv_table_df <- write_asv_table(
  read_count_sample_df,
  outfile = outfile,
  asv_tax = asv_tax,
  sampleinfo = sampleinfo_df
)