Appendix A1 -- Tutorial script

This page lists all the commands which were used to perform the tasks in the tutorial. Commands like column -t which serve only formatting issues are omitted.
# Prepare reference genome
samtools faidx input/genome/volpertinger.fasta

# Generate endogenous reads
scripts/uniform input/genome/volpertinger.fasta \
    --seed 1234 \
    --name volpertinger_ \
    --output-fastq data/2/volpertinger.coord data/2/volpertinger.fastq \
    25 20 5

# Generate exogenous reads
scripts/uniform input/retli/retli_tr.fasta \
    --seed 1235 \
    --name retli_ \
    --output-fastq data/2/retli.coord data/2/retli.fastq \
    50 20 5

# Introduce mutations into endogenous reads
scripts/filter_fastq --nucleotide \
  @ scripts/multiple_mutate --seed 123 input/mut-tables/mut.tab @ \
  < data/2/volpertinger.fastq \
  > data/3/volpertinger_mut.fastq

# Combine endogenous and exogenous reads
cat data/3/volpertinger_mut.fastq data/2/retli.fastq \
    > data/3/all.fastq

# Generate the table of mapping parameters per run
scripts/cross_tab --head 1 input/mapping/*.par | \
scripts/index_column --colname runidx > data/4/partab

# Convert the mapping run parameter table into executable calls
scripts/table2calls data/4/partab \
                    input/mapping/map-bwa.sh \
                  > data/4/calls

# Start parallel mapping runs
scripts/mcall -c data/4/calls -t 2 \
              --status

# Determine sensitivity and specificity for all generated SAM files
for sam in data/4/*.sam; do
    # Generate output prefix p from input name: `4.sam` -> `4`
    bn=$(basename $sam)
    p=${bn%.sam}

    # Convert SAM to table
    scripts/sam2table data/4/${p}.sam > data/5/${p}.tab

    # Using the FASTA record names from the SAM file, obtain the organism 
    # names where they stem from. This enables calculating statistics for
    # endogenous and exogenous reads, separately.
    scripts/add_mapped_organisms \
        --endogenous volpertinger \
                     input/genome/volpertinger.fasta.fai \
                     data/2/volpertinger.coord \
        --exogenous  retli \
                     input/retli/retli.fasta.fai \
                     data/2/retli.coord \
        data/5/${p}.tab \
        | scripts/write_later data/5/${p}.tab


    # Determine whether each read was correctly mapped. Adds column "correct"
    scripts/pocketR '
        within(input, { 
            correct =    
                mapped_pos == true_start  &
                mapped_rname == true_record &
                mapped_organism == true_organism })
    '  data/5/${p}.tab \
    | scripts/write_later data/5/${p}.tab

    # Count reads per origin/target organism and mapping status
    scripts/pocketR '
        aggregate( cbind(count=qname) ~ true_organism + mapped_organism + correct,
            FUN=length, data=input) ' \
    data/5/${p}.tab \
    > data/5/${p}.agg

    # Plot mapping targets per origin organism 
    scripts/plot_read_fate    --exogenous retli \
                              true_organism    mapped_organism \
                              correct          count \
                              data/5/${p}.pdf  data/5/${p}.agg

    # Calculate sensitivity, specificity and balanced accuracy
    scripts/sensspec --c-morg mapped_organism \
                     --c-torg true_organism \
                     data/5/${p}.agg volpertinger \
        > data/5/${p}.performance

    echo "$sam done. -> Generated data/5/${p}.{tab,agg,pdf,performance}"

done

# Add the run number to each .performance file
for f in data/5/*.performance; do
    echo $f
    i=$(basename ${f%.performance})

    scripts/add_const_column "$f" runidx "$i" \
        > "data/6/${i}.performance"
done

# Concatenate all tables, but print the header line only once.
scripts/cat_tables data/6/*.performance \
                 > data/6/performance

# Lookup the mapping run parameters for each run and merge them with the
# outcomes documented in data/6/performance
scripts/merge -a data/6/performance runidx \
              -b data/4/partab      runidx \
              --all-a-cols --all-b-cols --all-a \
           | scripts/write_later data/6/performance


# Plot BWA parameter -n versus BCR
scripts/plot_parameter_effects --signif 1 data/6/performance n bcr \
    data/6/n.pdf

# Plot BWA parameter -k versus BCR
scripts/plot_parameter_effects --signif 1 data/6/performance k bcr \
    data/6/k.pdf