All configurable options

This is an automatically generated summary of all configurable options for seq2science. These options are loosely grouped by workflow/topic, but they are generally shared across workflows, so a setting that applies to your workflow may be listed under a different topic. At the start of each seq2science run the complete configuration is printed to stdout; you can use that printed configuration as the complete list of tunable configuration settings.

We believe that all our default settings are reasonable, and manual fine-tuning is generally not required.

General

samples

samples:
    description: tab-separated file specifying the samples to run through the workflow
    default: samples.tsv
    type: string

rule_dir

rule_dir:
    description: directory where to find the workflow rules
    default: ../../rules
    type: string

result_dir

result_dir:
    description: where to store the pipeline's results (by default)
    default: ./results
    type: string

genome_dir

genome_dir:
    description: where to store the pipeline's genome assemblies
    default: ./genomes
    type: string

log_dir

log_dir:
    description: directory where to store the logs (defaults inside the result dir)
    default: log
    type: string

benchmark_dir

benchmark_dir:
    description: directory where to store the benchmarks (defaults inside the result dir)
    default: benchmark
    type: string

fqext1

fqext1:
    description: filename suffix when handling paired-end data, describing the forward read
    default: R1
    type: string

fqext2

fqext2:
    description: filename suffix when handling paired-end data, describing the reverse read
    default: R2
    type: string

fqsuffix

fqsuffix:
    description: file descriptor for fastq files (often fastq or fq)
    default: fastq
    type: string

cpulimit

cpulimit:
    description: whether or not to make use of cpulimit to enforce max thread usage for some rules
    default: True
    type: boolean

email

email:
    description: email to reach you after pipeline finished, required for trackhub and multiqc report
    type: string

niceness

niceness:
    description: with which niceness to run the shell commands. Ranges between 0-19, with 19 being the nicest you can set
    type: integer

Download

sra_dir

sra_dir:
    description: directory where to store the workflow sras (defaults inside the result dir)
    default: sra
    type: string

fastq_dir

fastq_dir:
    description: directory where to store the workflow fastqs (defaults inside the result dir)
    default: fastq
    type: string

ncbi_key

ncbi_key:
    description: e-utilities key for faster lookup interval
    type: string

ncbi_requests

ncbi_requests:
    description: the number of lookups per second the key permits
    type: integer

ascp_path

ascp_path:
    description: the pipeline supports downloading through ascp from ena if this variable is set.
    type: string

ascp_key

ascp_key:
    description: the public key of ascp
    type: string

keep_downloaded_fastq

keep_downloaded_fastq:
    description: whether or not to keep (raw) fastqs after downloading.
    default: False
    type: boolean

Alignment general

trimmed_dir

trimmed_dir:
    description: directory where to store the workflow trimmed fastqs (defaults inside the result dir)
    default: fastq_trimmed
    type: string

qc_dir

qc_dir:
    description: directory where to store the workflow quality controls (defaults inside the result dir)
    default: qc
    type: string

final_bam_dir

final_bam_dir:
    description: directory where to store the workflow's final deduplicated & sieved bams (defaults inside the result dir)
    default: final_bam
    type: string

trimmer

trimmer:
    description: which adapter trimmer to use
    properties:
        trimgalore:
            description: trim galore! settings (note that the adapter is autodetected, so you do not have to set it here)
            properties:
                trimoptions:
                    default: --quality 10 --length 20 --consider_already_trimmed 10
            default:
                trimoptions: --quality 10 --length 20 --consider_already_trimmed 10
        fastp:
            description: fastp settings (note that the adapter is autodetected, so you do not have to set it here)
            properties:
                trimoptions:
                    default:
            default:
                trimoptions:
    default:
        fastp:
            trimoptions:
    type: object

technical_replicates

technical_replicates:
    description: concatenates samples depending on the names given in the 'replicates' column of samples.tsv
    default: merge
    enum: ['merge', 'keep']

biological_replicates

biological_replicates:
    description: which method to use to combine replicates (fisher (n >=2), idr (n==2), or keep)
    default: keep
    enum: ['fisher', 'idr', 'keep']
    type: string

provider

provider:
    description: Specify provider to download new genomes from. Will try GENCODE > Ensembl > UCSC > NCBI if left blank.
    enum: ['GENCODE', 'Ensembl', 'UCSC', 'NCBI']
    type: string

aligner

aligner:
    description: which aligner to use
    properties:
        bowtie2:
            properties:
                index:
                    default:
                align:
                    default:
            default:
                index:
                align:
        bwa-mem:
            properties:
                index:
                    default: -a bwtsw
                align:
                    default: -M
            default:
                index: -a bwtsw
                align: -M
        bwa-mem2:
            properties:
                index:
                    default:
                align:
                    default: -M
            default:
                index:
                align:
        hisat2:
            properties:
                index:
                    default:
                align:
                    default:
            default:
                index:
                align:
        minimap2:
            properties:
                index:
                    default:
                align:
                    default: -a sr --secondary=no
            default:
                index:
                align:
        star:
            properties:
                index:
                    default: --limitGenomeGenerateRAM 37000000000 --genomeSAsparseD 1
                align:
                    default:
            default:
                index: --limitGenomeGenerateRAM 37000000000 --genomeSAsparseD 1
                align:
    default:
        bwa-mem2:
            index:
            align:

samtools_index

samtools_index:
    description: samtools index settings
    default:
    type: string

markduplicates

markduplicates:
    description: the parameters of picard markduplicates
    default: -Xms4G -Xmx6G MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=999
    type: string

min_mapping_quality

min_mapping_quality:
    description: minimum mapq to keep aligned reads
    default: 0
    type: integer

tn5_shift

tn5_shift:
    description: whether or not to shift reads to adjust for the tn5 bias
    default: False
    type: boolean

remove_blacklist

remove_blacklist:
    description: whether or not to remove reads in the encode blacklisted regions
    default: False
    type: boolean

remove_mito

remove_mito:
    description: whether or not to remove reads aligned to the mitochondria
    default: False
    type: boolean

only_primary_align

only_primary_align:
    description: whether or not to only keep primary aligned reads (no multimappers)
    default: False
    type: boolean

remove_dups

remove_dups:
    description: whether or not to remove duplicate reads (either optical or PCR)
    default: False
    type: boolean

create_qc_report

create_qc_report:
    description: whether or not to generate a multiqc report
    default: True
    type: boolean

bam_sort_mem

bam_sort_mem:
    description: The max memory used in gigabytes by samtools/sambamba when sorting bams
    default: 2
    type: integer

store_as_cram

store_as_cram:
    description: whether to store the final mapped and filtered reads as cram instead of bam
    type: boolean
    default: False

custom_genome_extension

custom_genome_extension:
    description: File(s) to append to the genome (must be in fasta format)

custom_annotation_extension

custom_annotation_extension:
    description: File(s) to append to the gene annotations (must be in GTF format)

custom_assembly_suffix

custom_assembly_suffix:
    description: What to append to the assembly name to indicate that custom data has been added to either genome or assembly.
    default: _custom
    type: string

deeptools_qc

deeptools_qc:
    description: Whether or not to perform deeptools QC. With many samples this can take a long time and might be undesirable
    default: True
    type: boolean

deeptools_computematrix_gene

deeptools_computematrix_gene:
    description: Deeptools computeMatrix params
    default: --beforeRegionStartLength 3000 --regionBodyLength 5000 --afterRegionStartLength 3000
    type: string

deeptools_plotcorrelation

deeptools_plotcorrelation:
    description: Deeptools plotCorrelation params
    default: --colorMap RdYlBu_r --plotNumbers
    type: string

deeptools_multibamsummary

deeptools_multibamsummary:
    description: Deeptools multiBamSummary params
    default: --distanceBetweenBins 9000 --binSize 1000
    type: string

min_template_length

min_template_length:
    description: the minimum template length for paired-end reads
    type: integer

max_template_length

max_template_length:
    description: the maximum template length for paired-end reads
    type: integer

subsample

subsample:
    description: the maximum number of reads allowed in the final bam file. When setting a value lower than zero, no subsampling happens (default).
    type: integer
    default: -1

Workflow: Alignment

bam_sorter

bam_sorter:
    description: which sorting software to use
    type: object
    properties:
        description: sort order of aligned reads
        default: coordinate
        enum: ['queryname', 'coordinate']
        type: string
    default:
        samtools: coordinate

Workflow: ChIP & ATAC-seq

peak_caller

peak_caller:
    type: object
    properties:
        genrich:
            type: string
        hmmratac:
            type: string
        macs2:
            type: string
    minProperties: 1
    additionalProperties: False
    default:
        macs2: --shift -100 --extsize 200 --nomodel --keep-dup 1 --buffer-size 10000

macs2_keep_mates

macs2_keep_mates:
    description: macs2 normally removes paired-end mates when doing --nomodel shift peak calling. When this flag is on, the paired-end bam is converted to single-end bam, so all reads are counted. This is not conventional to do, but might help with low sequencing depth.
    default: False
    type: boolean

peak_windowsize

peak_windowsize:
    description: when merging narrowpeaks, how much each summit should be extended by (on each side) when considering overlapping peaks. If for instance a peak_windowsize of 100 is chosen, all peaks within 100 bps of each other will be merged.
    default: 100
    type: integer

slop

slop:
    description: how much each summit is extended by (on both sides) when making the final peak. For e.g. count tables it is important that all peaks are of the same size. If for instance a slop of 100 is chosen, the final peak width (for all peaks) is 200.
    type: integer
    default: 100

logbase

logbase:
    description: The log base of the number of reads under peak count tables
    default: 2
    type: number

heatmap_slop

heatmap_slop:
    description: how much the summits should be extended by (on each side) for making a heatmap of peaks in the multiQC report. If for instance a slop of 1000 is chosen, the final peak's width is 2000.
    default: 1000
    type: integer

heatmap_npeaks

heatmap_npeaks:
    description: The number of random peaks used for the heatmap. This at most visualizes all peaks found, so picking a number higher than the number of peaks has no effect. The reason to pick a smaller number is to limit computational resources.
    default: 20000
    type: integer

deeptools_heatmap_options

deeptools_heatmap_options:
    description: the options passed to deeptools when visualizing the heatmap
    default: --kmeans 6 --xAxisLabel "Summit distance (bp)"
    type: string

idr_options

idr_options:
    description: the options passed to idr when combining peaks
    default:
    type: string

run_gimme_maelstrom

run_gimme_maelstrom:
    description: whether or not to run gimme maelstrom on the consensus peakset.
    default: False
    type: boolean

gimme_maelstrom_database

gimme_maelstrom_database:
    description: which motif database to run gimme maelstrom with.
    default: gimme.vertebrate.v5.0
    type: string

gimme_maelstrom_params

gimme_maelstrom_params:
    description: the settings with which gimme maelstrom is run.
    default:
    type: string

infer_motif2factors

infer_motif2factors:
    description: whether or not seq2science should try to infer the motif-TF relationships based on orthology
    default: True
    type: boolean

motif2factors_database_references

motif2factors_database_references:
    description: On which assembly(s) the original gimme motif2factors is based. Defaults to human and mouse.
    default: ['GRCh38.p13', 'GRCm38.p6']
    type: array
    items:
        type: string

motif2factors_reference

motif2factors_reference:
    description: Which assemblies should be taken along for the orthology inference.
    default: ['danRer11', 'UCB_Xtro_10.0', 'galGal6', 'BraLan3', 'oryLat2', 'ARS-UCD1.2', 'phaCin_unsw_v4.1', 'rCheMyd1.pri']
    type: array
    items:
        type: string

Workflow: RNA-seq

quantifier

quantifier:
    description: which quantifier to use
    properties:
        salmon:
            properties:
                quantifier_index:
                    default: -k 31
                quantifier_flags:
                    default: --seqBias --gcBias --posBias --validateMappings --recoverOrphans
                quantifier_decoys:
                    description: improve quantification accuracy at the cost of increased memory usage.
                    enum: ['none', 'partial', 'full']
                    type: string
                    default: full
            default:
                quantifier_index: -k 31
                quantifier_flags: --seqBias --gcBias --posBias --validateMappings --recoverOrphans
                quantifier_decoys: full
        htseq:
            properties:
                htseq_flags:
                    default:
            default:
                htseq_flags:
        featurecounts:
            properties:
                featurecounts_flags:
                    default:
            default:
                featurecounts_flags:
    default:
        htseq:
            htseq_flags:

tpm2counts

tpm2counts:
    description: how do you wish to convert TPMs to gene counts?
    properties:
        tximeta:
            description: Tximeta uses Ensembl/GENCODE assemblies to convert TPMs to gene counts, and produces an additional R object with the various contents.
            properties:
                txi_source:
                    description: the source of transcriptome
                    enum: ['Ensembl', 'GENCODE', 'de-novo']
                    default: de-novo
                txi_organism:
                    description: organism (e.g. "Homo sapiens")
                    default: tximeta_file
                txi_release:
                    description: release number of the source database (e.g. "101")
                    default: 42
            default:
                txi_source: de-novo
                txi_organism: tximeta_file
                txi_release: 42
        pytxi:
            description: Pytxi uses genomepy assemblies to convert TPMs to gene counts (using either the GTF file or MyGene.info).
            properties:
                tx2gene_from_gtf:
                    description: try to convert to gene names (symbols) using the GTF file. If false (or the GTF cannot help), pytxi uses MyGene.info.
                    type: boolean
                    default: True
            default:
                tx2gene_from_gtf: True
    default:
        pytxi:
            tx2gene_from_gtf: True

ignore_strandedness

ignore_strandedness:
    description: ignore/don't infer strand-specificity?
    default: False
    type: boolean

dexseq

dexseq:
    description: output an exon counts matrix for use in DEXSeq.
    default: False
    type: boolean

Workflow: Single-cell ATAC-seq

snaptools_opt

snaptools_opt:
    description: snaptools optional arguments for snaptools snap-pre, from snaptools (https://github.com/r3fang/SnapTools). For more info run snaptools snap-pre -h
    default: --min-flen=0 --max-flen=1000 --keep-single=FALSE --keep-secondary=FALSE --overwrite=True --min-cov=100 --verbose=True

bin_opt

bin_opt:
    description: snaptools optional arguments for snaptools snap-add-bmat, from snaptools (https://github.com/r3fang/SnapTools). The genome will be divided into bins of the equal size of --bin-size-list to create the cell x bin count matrix. For more info run snaptools snap-add-bmat -h
    default: --bin-size-list 5000 --verbose=True

Workflow: Single-cell RNA-seq

fastq_clean_dir

fastq_clean_dir:
    description: directory where to store the cleaned fastqs
    default: fastq_cleaned
    type: string

fastq-pair

fastq-pair:
    description: fastq synchronization
    default:

quantifier

quantifier:
    description: which quantifier to use
    properties:
        citeseqcount:
            properties:
                count:
                    default: -cbf 9 -cbl 16 -umif 1 -umil 8 -cells 372 --max-error 1 --bc_collapsing_dist 1 --umi_collapsing_dist 1
        kallistobus:
            properties:
                ref:
                    default: --workflow lamanno
                count:
                    default: -x 10XV3 --verbose --h5ad --workflow lamanno
    default:
        kallistobus:
            ref: --workflow lamanno
            count: -x 10XV3 --verbose --h5ad --workflow lamanno

barcodefile

barcodefile:
    type: string

sc_preprocess

sc_preprocess:
    description: Pre-processing settings for scRNA-seq data
    properties:
        export_sce_objects:
            description: Export scRNA-seq UMI count tables to SingleCellExperiment objects and save in .Rds file format
            default: False
            type: boolean
        run_sctk_qc:
            description: Run singleCellTK quality control workflow
            default: False
            type: boolean
        velo_assay:
            description: Count assay to use for export and quality control when kb is run with --workflow lamanno
            default: spliced
            enum: ['spliced', 'unspliced']
        sctk_data_type:
            description: Type of UMI count matrix, either cell or droplet counts
            default: cell
            enum: ['cell', 'droplet']
        sctk_detect_mito:
            description: Calculate mitochondrial gene ratio for quality control
            default: True
            type: boolean
        sctk_mito_set:
            description: Mitochondrial gene set to use for quality control
            default: human-symbol
            enum: ['human-ensembl', 'mouse-ensembl', 'human-entrez', 'mouse-entrez', 'human-symbol', 'mouse-symbol']
        sctk_detect_cell:
            description: Perform cell calling for droplet based scRNA-seq assays
            default: True
            type: boolean
        sctk_cell_calling:
            description: Cell calling method to use
            default: Knee
            enum: ['EmptyDrops', 'Knee']
        sctk_export_formats:
            description: File formats for SingleCellExperiment object export
            default: ['Seurat']
            type: array
        sctk_qc_algos:
            description: QC algorithms for CellQC (debug only)
            default: ['QCMetrics', 'scDblFinder', 'decontX']
            type: array
        use_alt_expr:
            description: Process alternative experiments (if present) such as ERCC Spike-in's
            default: False
            type: boolean
        alt_exp_reg:
            description: Regular expression for alternative feature filtering from gene identifiers
            default: ERCC-*
            type: string
        alt_exp_name:
            description: Name for the alternative assay to store along SCE object
            default: ALT
            type: string

Differential gene/peak analysis

counts_dir

counts_dir:
    description: directory where to store the counts for DESeq2 analysis (defaults inside the result dir)
    default: counts
    type: string

deseq2_dir

deseq2_dir:
    description: directory where to store the DESeq2 output (defaults inside the result dir)
    default: deseq2
    type: string

deseq2

deseq2:
    description: DESeq2 settings. See the DESeq2 vignette for details
    properties:
        multiple_testing_procedure:
            default: BH
            enum: ['BH', 'IHW']
            type: string
        alpha_value:
            default: 0.1
            type: number
        shrinkage_estimator:
            default: apeglm
            enum: ['apeglm', 'ashr', 'normal']
            type: string
        single_cell:
            default: False
            type: boolean
    default:
        multiple_testing_procedure: BH
        alpha_value: 0.1
        shrinkage_estimator: apeglm
        single_cell: False

contrasts

contrasts:
    description: deseq2 design contrast(s)

Trackhub

create_trackhub

create_trackhub:
    description: generate a trackhub (and required files)?
    default: True
    type: boolean

trackhub_dir

trackhub_dir:
    description: directory where to store the trackhub (defaults inside the result dir)
    default: trackhub
    type: string

bigwig_dir

bigwig_dir:
    description: directory where to store the bigwigs used by the trackhub (defaults inside the result dir)
    default: bigwigs
    type: string

deeptools_bamcoverage

deeptools_bamcoverage:
    description: flags for bam to bigwig conversions with deeptools bamCoverage
    default: --normalizeUsing BPM --binSize 1

force_assembly_hub

force_assembly_hub:
    description: whether an assembly hub should be made even though the assembly is supported by ucsc. Sometimes e.g. chromosomes have different names (chr1 vs 1) which makes the trackhub not display the data properly. Forcing an assembly hub can help in these cases.
    default: False
    type: boolean