Please enable JavaScript.

Coggle requires JavaScript to display documents.

BamProcessing.wdl, task BaseRecalibrator {, task ApplyBQSR {, BQSR -…

- - - - awk -v OFS='\t' '$2=$2-1' |
      - /app/bedtools intersect -c -a ~{contamination_sites_bed} -b - |
      - cut -f6 > ~{target_overlap_counts}
    - - print lines from whole-genome file from loci with non-zero overlap
      - with target intervals
      - WGS_FILE=$1
      - EXOME_FILE=$2
      - paste ~{target_overlap_counts} $WGS_FILE |
        
        grep -Ev "^0" |
        
        cut -f 2- > $EXOME_FILE
      - echo "Generated $EXOME_FILE"
- - - - GatherBamFiles \
      - INPUT=~{sep=' INPUT=' input_bams} \
      - OUTPUT=~{output_bam_basename}.bam \
      - CREATE_INDEX=true \
      - CREATE_MD5_FILE=true
- - - - GatherBamFiles \
      - INPUT=~{sep=' INPUT=' input_bams} \
      - OUTPUT=~{output_bam_basename}.bam \
      - CREATE_INDEX=false \
      - CREATE_MD5_FILE=false
- - - - But first, it is divided by an underestimation factor thusly:
        
        float(FREEMIX) / ContaminationUnderestimationFactor
        
        where the denominator is hardcoded in Zamboni:
        
        val ContaminationUnderestimationFactor = 0.75f
      - Here, I am handling this by returning both the original selfSM file for reporting, and the adjusted contamination estimate for use in variant calling
      - Contamination is also stored in GVCF_CALLING and thereby passed to HAPLOTYPE_CALLER
      - In Zamboni production, this value is stored directly in METRICS.AGGREGATION_CONTAM
      - The contamination value is read from the FREEMIX field of the selfSM file output by verifyBamId
  - - - The first row holds the keys (e.g., SEQ_SM, RG, FREEMIX); the second row holds the associated values
      - creates a ~{output_prefix}.selfSM file, a TSV file with 2 rows, 19 columns.
    - - --Verbose \
      - --NumPC 4 \
      - --Output ~{output_prefix} \
      - --BamFile ~{input_bam} \
      - --Reference ~{ref_fasta} \
      - --UDPath ~{contamination_sites_ud} \
      - --MeanPath ~{contamination_sites_mu} \
      - --BedPath ~{contamination_sites_bed} \
      - ~{true="--DisableSanityCheck" false="" disable_sanity_check} \
      - 1>/dev/null
    - - used to read from the selfSM file and calculate contamination, which gets printed out
    - - reader = csv.DictReader(selfSM, delimiter='\t')
      - i = 0
      - for row in reader:
        
        if float(row["FREELK0"])==0 and float(row["FREELK1"])==0:
        
        sys.stderr.write("Found zero likelihoods. Bam is either very-very shallow, or aligned to the wrong reference (relative to the vcf).")
        
        sys.exit(1)
        
        #
        
        a zero value for the likelihoods implies no data. This usually indicates a problem rather than a real event.
        
        if the bam isn't really empty, this is probably due to the use of an incompatible reference build between
        
        vcf and bam.
        
        print(float(row["FREEMIX"])/~{contamination_underestimation_factor})
        
        i = i + 1
        
        if i != 1:
        
        sys.stderr.write("Found %d rows in .selfSM file. Was expecting exactly 1. This is an error"%(i))
        
        sys.exit(2)
        
        #
        
        there should be exactly one row, and if this isn't the case the format of the output is unexpectedly different and the results are not reliable.
- - - - Array[File] input_bams
      - String output_bam_basename
      - String metrics_filename
      - Float total_input_size
      - Int compression_level
      - Int preemptible_tries
      - The program default for READ_NAME_REGEX is appropriate in nearly every case.
      - Sometimes we wish to supply "null" in order to turn off optical duplicate detection
      - This can be desirable if you don't mind the estimated library size being wrong and optical duplicate detection is taking >7 days and failing
      - String? read_name_regex
      - Int memory_multiplier = 1
      - Int additional_disk = 20
    - - java -Dsamjdk.compression_level=~{compression_level} -Xms~{java_memory_size}g -jar /usr/gitc/picard.jar \
        
        MarkDuplicates \
        
        INPUT=~{sep=' INPUT=' input_bams} \
        
        OUTPUT=~{output_bam_basename}.bam \
        
        METRICS_FILE=~{metrics_filename} \
        
        VALIDATION_STRINGENCY=SILENT \
        
        ~{"READ_NAME_REGEX=" + read_name_regex} \
        
        OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \
        
        ASSUME_SORT_ORDER="queryname" \
        
        CLEAR_DT="false" \
        
        ADD_PG_TAG_TO_READS=false
    - - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.3-1564508330"
      - preemptible: preemptible_tries
      - memory: "~{memory_size} GiB"
      - disks: "local-disk " + disk_size + " HDD"
    - - File output_bam = "~{output_bam_basename}.bam"
      - File duplicate_metrics = "~{metrics_filename}"
  - - - Array[File] input_bams
      - String output_bam_basename
      - String metrics_filename
      - Float total_input_size
      - Int compression_level
      - Int preemptible_tries
      - String? read_name_regex
      - Int memory_multiplier = 3
      - Int cpu_size = 6
    - - set -e
      - export GATK_LOCAL_JAR=/root/gatk.jar
      - gatk --java-options "-Dsamjdk.compression_level=~{compression_level} -Xmx~{java_memory_size}g" \
        
        MarkDuplicatesSpark \
        
        --input ~{sep=' --input ' input_bams} \
        
        --output ~{output_bam_location} \
        
        --metrics-file ~{metrics_filename} \
        
        --read-validation-stringency SILENT \
        
        ~{"--read-name-regex " + read_name_regex} \
        
        --optical-duplicate-pixel-distance 2500 \
        
        --treat-unsorted-as-querygroup-ordered \
        
        --create-output-bam-index false \
        
        -- --conf spark.local.dir=/mnt/tmp --spark-master 'local[16]' --conf 'spark.kryo.referenceTracking=false'
    - - docker: "jamesemery/gatknightly:gatkMasterSnapshot44ca2e9e84a"
      - disks: "/mnt/tmp " + ceil(2.1 * total_input_size) + " LOCAL, local-disk " + disk_size + " HDD"
      - bootDiskSizeGb: "50"
      - cpu: cpu_size
      - memory: "~{memory_size} GiB"
      - preemptible: preemptible_tries
    - - File output_bam = output_bam_location
      - File duplicate_metrics = metrics_filename
  - - - Array[File] input_bqsr_reports
      - String output_report_filename
      - Int preemptible_tries
      - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1"
    - - gatk --java-options "-Xms3000m" \
        
        GatherBQSRReports \
        
        -I ~{sep=' -I ' input_bqsr_reports} \
        
        -O ~{output_report_filename}
      - }
    - - docker: gatk_docker
      - preemptible: preemptible_tries
      - memory: "3500 MiB"
      - disks: "local-disk 20 HDD"
    - - File output_bqsr_report = "~{output_report_filename}"
- - - - File input_bam
      - String output_bam_basename
      - Int preemptible_tries
      - Int compression_level
    - - more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier
      - SortSam spills to disk a lot more because we now store only 300000 records in RAM (it's faster for our data), so it needs
    - - java -Dsamjdk.compression_level=~{compression_level} -Xms4000m -jar /usr/gitc/picard.jar \
        
        SortSam \
        
        INPUT=~{input_bam} \
        
        OUTPUT=~{output_bam_basename}.bam \
        
        SORT_ORDER="coordinate" \
        
        CREATE_INDEX=true \
        
        CREATE_MD5_FILE=true \
        
        MAX_RECORDS_IN_RAM=300000
    - - docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.3-1564508330"
      - disks: "local-disk " + disk_size + " HDD"
      - cpu: "1"
      - memory: "5000 MiB"
      - preemptible: preemptible_tries
    - - File output_bam = "~{output_bam_basename}.bam"
      - File output_bam_index = "~{output_bam_basename}.bai"
      - File output_bam_md5 = "~{output_bam_basename}.bam.md5"
    - - Sort BAM file by coordinate order
  - - - Sort BAM file by coordinate order -- using Spark!
    - - File input_bam
      - String output_bam_basename
      - Int preemptible_tries
      - Int compression_level
      - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.0.10.1"
    - - more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier
      - SortSam spills to disk a lot more because we now store only 300000 records in RAM (it's faster for our data), so it needs
    - - set -e
      - gatk --java-options "-Dsamjdk.compression_level=~{compression_level} -Xms100g -Xmx100g" \
        
        SortSamSpark \
        
        -I ~{input_bam} \
        
        -O ~{output_bam_basename}.bam \
        
        -- --conf spark.local.dir=. --spark-master 'local[16]' --conf 'spark.kryo.referenceTracking=false'
      - samtools index ~{output_bam_basename}.bam ~{output_bam_basename}.bai
    - - docker: gatk_docker
      - disks: "local-disk " + disk_size + " HDD"
      - bootDiskSizeGb: "15"
      - cpu: "16"
      - memory: "102 GiB"
      - preemptible: preemptible_tries
    - - File output_bam = "~{output_bam_basename}.bam"
      - File output_bam_index = "~{output_bam_basename}.bai"