This page lists all the commands which were used to perform the tasks in the tutorial. Commands like column -t
which serve only formatting issues are omitted.
# Prepare reference genome
samtools faidx input/genome/volpertinger.fasta
# Generate endogenous reads
scripts/uniform input/genome/volpertinger.fasta \
--seed 1234 \
--name volpertinger_ \
--output-fastq data/2/volpertinger.coord data/2/volpertinger.fastq \
25 20 5
# Generate exogenous reads
scripts/uniform input/retli/retli_tr.fasta \
--seed 1235 \
--name retli_ \
--output-fastq data/2/retli.coord data/2/retli.fastq \
50 20 5
# Introduce mutations into endogenous reads
scripts/filter_fastq --nucleotide \
@ scripts/multiple_mutate --seed 123 input/mut-tables/mut.tab @ \
< data/2/volpertinger.fastq \
> data/3/volpertinger_mut.fastq
# Combine endogenous and exogenous reads
cat data/3/volpertinger_mut.fastq data/2/retli.fastq \
> data/3/all.fastq
# Generate the table of mapping parameters per run
scripts/cross_tab --head 1 input/mapping/*.par | \
scripts/index_column --colname runidx > data/4/partab
# Convert the mapping run parameter table into executable calls
scripts/table2calls data/4/partab \
input/mapping/map-bwa.sh \
> data/4/calls
# Start parallel mapping runs
scripts/mcall -c data/4/calls -t 2 \
--status
# Determine sensitivity and specificity for all generated SAM files
for sam in data/4/*.sam; do
# Generate output prefix p from input name: `4.sam` -> `4`
bn=$(basename $sam)
p=${bn%.sam}
# Convert SAM to table
scripts/sam2table data/4/${p}.sam > data/5/${p}.tab
# Using the FASTA record names from the SAM file, obtain the organism
# names where they stem from. This enables calculating statistics for
# endogenous and exogenous reads, separately.
scripts/add_mapped_organisms \
--endogenous volpertinger \
input/genome/volpertinger.fasta.fai \
data/2/volpertinger.coord \
--exogenous retli \
input/retli/retli.fasta.fai \
data/2/retli.coord \
data/5/${p}.tab \
| scripts/write_later data/5/${p}.tab
# Determine whether each read was correctly mapped. Adds column "correct"
scripts/pocketR '
within(input, {
correct =
mapped_pos == true_start &
mapped_rname == true_record &
mapped_organism == true_organism })
' data/5/${p}.tab \
| scripts/write_later data/5/${p}.tab
# Count reads per origin/target organism and mapping status
scripts/pocketR '
aggregate( cbind(count=qname) ~ true_organism + mapped_organism + correct,
FUN=length, data=input) ' \
data/5/${p}.tab \
> data/5/${p}.agg
# Plot mapping targets per origin organism
scripts/plot_read_fate --exogenous retli \
true_organism mapped_organism \
correct count \
data/5/${p}.pdf data/5/${p}.agg
# Calculate sensitivity, specificity and balanced accuracy
scripts/sensspec --c-morg mapped_organism \
--c-torg true_organism \
data/5/${p}.agg volpertinger \
> data/5/${p}.performance
echo "$sam done. -> Generated data/5/${p}.{tab,agg,pdf,performance}"
done
# Add the run number to each .performance file
for f in data/5/*.performance; do
echo $f
i=$(basename ${f%.performance})
scripts/add_const_column "$f" runidx "$i" \
> "data/6/${i}.performance"
done
# Concatenate all tables, but print the header line only once.
scripts/cat_tables data/6/*.performance \
> data/6/performance
# Lookup the mapping run parameters for each run and merge them with the
# outcomes documented in data/6/performance
scripts/merge -a data/6/performance runidx \
-b data/4/partab runidx \
--all-a-cols --all-b-cols --all-a \
| scripts/write_later data/6/performance
# Plot BWA parameter -n versus BCR
scripts/plot_parameter_effects --signif 1 data/6/performance n bcr \
data/6/n.pdf
# Plot BWA parameter -k versus BCR
scripts/plot_parameter_effects --signif 1 data/6/performance k bcr \
data/6/k.pdf