Merge branch 'dev_Jules' into 'master'

V1.24.0 > V1.25.0 See merge request !25

Merge branch 'dev_Jules' into 'master'
V1.24.0 > V1.25.0 See merge request !25
9cd7b92a · Jules Sabban · 67168dc8 · a954a495 · 9cd7b92a · 9cd7b92a
Commit 9cd7b92a authored 4 months ago by Jules Sabban
--- a/assets/begin_template.txt
+++ b/assets/begin_template.txt
@@ -6,7 +6,7 @@
 ==================================================
 ----------------------------------------------------------------------------------

-NextFlow Run Name : $wfRunName  
+NextFlow Run Name : $runName  

 Demultiplexing is over, the analysis started at $dateStart.


--- a/assets/error_email_template.txt
+++ b/assets/error_email_template.txt
+----------------------------------------------------------------------------------
+==================================================
+------------------------------- get-nf workflow ----------------------------
+	         S H O R T  R E A D S  - N F   P I P E L I N E    
+     			    V$version
+==================================================
+----------------------------------------------------------------------------------
+
+NextFlow Run Name : $runName
+Project : $project
+
+An error occurred during the analysis pipeline run.
+The pipeline did not complete! Some analyses or files may be missing from NGL-Bi.
+
+The command used to launch the workflow was as follows :
+
+  $commandLine
+  
+The pipeline was launched from :
+
+  $launchDir
+  
+The error message is :
+
+  $errorMessage
+  
+  $errorReport
+
+
+---
+$name
+$homePage
--- a/conf/base.config
+++ b/conf/base.config
@@ -341,7 +341,7 @@ process {
 		module = toolsModuleHash['SEQTK']
 	}

-	withName: ADD_MULTIQC {
+	withName: ADD_REPORT {
 		errorStrategy = 'ignore'
 	}


--- a/conf/report.config
+++ b/conf/report.config
@@ -29,5 +29,5 @@ manifest {
 	description = "Workflow for Illumina data quality control"
 	mainScript = 'main.nf'
 	nextflowVersion = '>=0.32.0'
-	version = '1.24.0'
+	version = '1.25.0'
 }
\ No newline at end of file
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -88,9 +88,10 @@ _Default_ : null

 ### Optionnal parameters
 Some other parameters are only for tracability and have no effect on analysis, there are :  
- **`--outdir_prefix`** [str]  
-This value will be a part of the name of the output directory. The real output directory is constructed as follow : `${inputdir}/nextflow/${outdir_prefix}_${nf_uniqueness}`. Where `${nf_uniqueness}` is the current date.  
-_Default_ : `${project}_${run_name}`
+- **`--outdir`** [str]  
+Path to the output directory. The real output directory is constructed as follows: `${params.inputdir}/nextflow/${params.project}/${params.run_name}_${nf_uniqueness}` if `inputdir`, `project` and `run_name` are all set, or `${launchDir}/results_${nf_uniqueness}` otherwise, where `${nf_uniqueness}` is the current date and time (`yyyyMMddHHmmss`).  
+This parameter is not intended to be set manually.  
+_Default_ : `${launchDir}/results_${nf_uniqueness}`

 - **`--machine_id`** [str]  
 The machine identifier, such as `A00318` or `AV232702`.  
@@ -120,10 +121,6 @@ _Default_ : null
 The nG6 like description of the analysis.  
 _Default_ : null

- **`--merge_lanes`** [bool]  
-Merge fastq over the two lanes in CORE pipeline.   
-_Default_ : false
-
 ### Skipping parameters
 There are some availlable flags can be set to not run some parts of the pipeline.  
 - **`--no_subset`** [bool]  

--- a/lib/pipeline.groovy
+++ b/lib/pipeline.groovy
+/*
+*   SHORT READS PIPELINE FUNCTIONS
+*/
+
+// ----------------------------------
+//              Includes    
+// ----------------------------------
+import java.text.SimpleDateFormat
+
+include {
+	helpMessage;
+	printOptions;
+	paramsValidation;
+    customMailSend;
+    sendFinalMail;
+    get_workflow_info;
+    createSummary;
+} from "${params.shared_modules}/lib/utils.groovy"
+
+
+// ----------------------------------
+//        Variables Definition    
+// ----------------------------------
+// Timestamp format used for the start e-mail.
+// Declared with a type, so it is local to the script body: the functions
+// below cannot see it and build their own formatter.
+SimpleDateFormat format = new SimpleDateFormat("dd/MM/yyyy HH:mm:ss")
+
+// The manifest name is split on '/' into group and project; the technology
+// is the second '-'-separated token of the project part.
+pipeline_info = workflow.manifest.name.split('/')
+pipeline_group = pipeline_info[0]
+pipeline_project = pipeline_info[1]
+pipeline_techno = pipeline_project.split('-')[1]
+
+// Main recipient: the user-supplied address, falling back to the bioinfo
+// team address. The Elvis operator also covers the empty string (the default
+// recorded for `email` in pipeline_options_map), not only null, so the
+// fallback is really applied when no address was given.
+email_main = params.email ?: params.email_bioinfo
+
+// Every address used by the notification helpers, keyed by role.
+emails_map = [
+    main: email_main,
+    bioinfo: params.email_bioinfo,
+    labo: params.email_labo,
+    failure: params.email_on_fail,
+    dev: params.email_dev
+]
+
+// Declarative description of every pipeline option.
+// Each key is an option name; each value is a map with:
+//   default  : the default value recorded for the option
+//   optional : false when the option is flagged as mandatory (presumably what
+//              paramsValidation() checks — confirm in utils.groovy), true otherwise
+//   help     : one-line description of the option
+// In this file the map is handed to helpMessage(), paramsValidation() and printOptions().
+pipeline_options_map = [
+    inputdir:           [default: '', optional: false, help: 'Path to the input directory [demultiplexing output directory]'],
+    // NOTE(review): single-quoted string, so '$params.inputdir' is NOT interpolated here; the text is kept literally — confirm this is intended.
+    samplesheet:        [default: '$params.inputdir/SampleSheet.csv', optional: true, help: 'Path to IEM SampleSheet. Only for Illumina analysis'],
+    outdir:             [default: launchDir + '/results' , optional: true, help: 'Path where results are stored. Do not modify it, its value is automatically set'],
+    project:            [default: '', optional: false, help: 'Project\'s name'],
+    project_hash:       [default: '', optional: true, help: 'Project\'s hash ID for NGL-Bi'],
+    select_samples:     [default: '', optional: true, help: 'Comma separated list of samples name. Each sample in this list must match exactly the beginning of the fastq file name in the project directory. If this option is empty, the workflow takes as input every fastq files in the director'],
+    sequencer:          [default: '', optional: false, help: 'Name of the sequencer [NovaSeq600, AVITI2, ...]'],
+    machine_id:         [default: '', optional: true, help: 'Serial Number or unique identifier of the sequencer'],
+    fc_id:              [default: '', optional: true, help: '[metadata] Identifier of the Flowcell used'],   // useless ??
+    fc_type:            [default: '', optional: true, help: '[metadata] Type of the Flowcell'],    // useless ??
+    lane:               [default: '', optional: false, help: 'Number of the lane'],
+    data_nature:        [default: '', optional: true, help: 'ENA library strategy [AMPLICON, 16S, WGS, ...]. It will be use to select analysis to perform'],
+    species:            [default: '', optional: true, help: '[metadata] Scientific name of the species'],
+    is_multiplex:       [default: false, optional: true, help: 'true if several samples were sequenced on the same lane'],
+    run_name:           [default: '', optional: false, help: 'Human readable identifier of the analysis'],
+    run_date:           [default: '', optional: false, help: 'Use for file renamming. Format : DD/MM/YYYY or YYYYMMDD'],
+    description:        [default: '', optional: true, help: 'NG6 description of the run'],     // useless ??
+    fastp_n_reads:      [default: 100000000, optional: true, help: 'Number of reads to process using fastp'],
+    no_subset:          [default: false, optional: true, help: 'Is a subset of reads in fastq must be done before QC'],
+    large_sampling_threshold: [default: 200, optional: true, help: 'Number of samples from which we consider that the sequencing is highly multiplexed'],
+    miseq_subset_seq:   [default: '50000', optional: true, help: 'Number of reads to subsampling on MiSeq run'],
+    nova_subset_seq:    [default: '50000000', optional: true, help: 'Number of reads to subsampling on NovaSeq run'],
+    large_indexing_nova_subset_seq: [default: '500000', optional: true, help: 'Number of reads to subsampling on highly multiplexed NovaSeq run'],
+    aviti_subset_seq:   [default: '50000000', optional: true, help: 'Number of reads to subsampling on Aviti run'],
+    //depth: [default: 0, optional: true, help: 'In subsampling, number of X to keep'],     // Not use for the moment
+    reference_genome:   [default: '', optional: true, help: 'Path to the genome FASTA file'],
+    reference_transcriptome: [default: '', optional: true, help: 'Path to the transcriptome FASTA file'],
+    make_star_index:    [default: false, optional: true, help: 'Is the FASTA file must be indexed by star'],
+    sortmerna_db_path:  [default: '/work/project/PlaGe/sortemerna_db', optional: true, help: 'Path to the directory where the sortmerna databases are'],
+    min_overlap:        [default: 20, optional: true, help: 'For join pair step, minimum overlapping value [-m Flash option]'],
+    max_overlap:        [default: 55, optional: true, help: 'For join pair step, maximum overlapping value [-M Flash option]'],
+    max_mismatch_density: [default: 0.1, optional: true, help: 'For join pair step, maximum mismatch density [-x Flash option]'],
+    assignation_databank: [default: '/save/ng6/TODO/HiSeqIndexedGenomes/new_struct/ncbi_16S/240319_release/16SMicrobial', optional: true, help: 'Path to 16S Microbial database'],
+    blast_outfmt:       [default: 7, optional: true, help: 'Format of output for BLASTn'],
+    blast_max_target:   [default: 10, optional: true, help: 'Maximum number of hits from BLASTn'],
+    single_cell:        [default: false, optional: true, help: 'true is library was build with 10X kit'],
+    puc19:              [default: '', optional: true, help: 'Path to the puC19 fasta for MethylSeq analysis'],
+    lambda:             [default: '', optional: true, help: 'Path to the lambda fasta for MethylSeq analysis'],
+    ngl_bi_client:      [default: '/home/sbsuser/save/scripts-ngs/shared_modules_Current', optional: true, help: 'Path to NGL-Bi_Client sources'],
+    insert_to_ngl:      [default: true, optional: false, help: 'Should the data be stored on NGL ?'],
+    bi_run_code:        [default: '', optional: true, help: 'NGL-Bi Run Code'],
+    sq_xp_code:         [default: '', optional: true, help: 'NGL-SQ Experiment Code'],
+    ng6_name:           [default: true, optional: false, help: 'Is files must be nammed using ng6 standard'],
+    shared_modules:     [default: '/home/sbsuser/save/scripts-ngs/shared_modules_Current', optional: false, help: 'Path to Shared_modules sources'],
+    max_memory:         [default: '500.GB', optional: false, help: 'Maximum amount of memory that can be used to launch a sbatch job'],
+    max_time:           [default: '90.d', optional: false, help: 'Maximum amount of time that can be used to launch a sbatch job'],
+    max_cpus:           [default: '48', optional: false, help: 'Maximum number of CPUs that can be used to launch a sbatch job'],
+    default_label:      [default: 'Pipeline', optional: false, help: 'Default label for MultiQC'],
+    read_stats_label:   [default: 'ReadStats', optional: false, help: 'Read Stats label for MultiQC'],
+    duplicats_label:    [default: 'Duplicats', optional: false, help: 'Duplicats label for MultiQC'],
+    contamination_search_label: [default: 'ContaminationSearch', optional: false, help: 'Contamination Search label for MultiQC'],
+    join_pairs_label:   [default: 'JoinPairs', optional: false, help: 'Join Pairs label for MultiQC'],
+    alignment_stats_label: [default: 'AlignmentStats', optional: false, help: 'Alignment Stats label for MultiQC'],
+    cluster_options:    [default: '', optional: true, help: 'Sbatch options to pass to each job'],
+    is_dev_mode:        [default: false, optional: false, help: 'Preset of some options'],
+    DTM_mode:           [default: false, optional: false, help: 'Enable some process for DTM analysis'],
+    email:              [default: '', optional: false, help: 'Main email adress for execution pipeline notifications'],
+    email_on_fail:      [default: 'jules.sabban@inrae.fr', optional: false, help: 'Email adress to notify execution pipeline errors'],
+    email_bioinfo:      [default: 'get-plage.bioinfo@genotoul.fr', optional: true, help: 'Bioinformatics team email adress for execution pipeline notifications'],
+    email_labo:         [default: '', optional: true, help: 'Biologists team email adress for execution pipeline notifications'],
+    host:               [default: 'genobioinfo', optional: false, help: 'Name of the HPC where the pipeline is executed. Must have special config file in conf folder'],
+    skip_core_illumina: [default: false, optional: false, help: 'To skip Illumina subworkflow'],
+    skip_core_element:  [default: false, optional: false, help: 'To skip Elembio subworkflow'],
+    help:               [default: false, optional: true, help: 'To print help message']
+]
+
+// Fields used to render the start-of-run e-mail.
+// version, wfRunName and commandLine are deliberately not set here
+// (NOTE(review): presumably supplied by get_workflow_info — confirm in utils.groovy).
+def begin_fields = [
+    subject_prefix: "[${params.sequencer}]",
+    // last path component of the input directory
+    subject_sufix:  params.inputdir.split('/')[-1],
+    run_name:       params.run_name,
+    runNGLBi:       params.bi_run_code ? params.bi_run_code : '',
+    xpNGLSq:        params.sq_xp_code ? params.sq_xp_code : '',
+    project:        params.project,
+    sequencer:      params.sequencer,
+    flowcell:       params.fc_id,
+    lane:           params.lane,
+    data_nature:    params.data_nature,
+    directory:      params.inputdir,
+    dateStart:      format.format(new Date())
+]
+begin_email_fields = get_workflow_info(begin_fields)
+
+// ----------------------------------
+//        Functions Definition    
+// ----------------------------------
+/**
+ * Builds the field map used to render the end-of-run e-mail.
+ *
+ * @param formatted_date  completion timestamp, already formatted as text; passed on as `dateComplete`
+ * @param summary         run summary; an empty map is used when it is null or empty
+ * @return the result of get_workflow_info() applied to these fields
+ */
+def create_final_email_fields(formatted_date, summary) {
+    def final_fields = [
+        subject_prefix: "[${params.sequencer}]",
+        // last path component of the input directory
+        subject_sufix:  params.inputdir.split('/')[-1],
+        project:        params.project ? params.project : '',
+        run:            params.run_name ? params.run_name : '',
+        runNGLBi:       params.bi_run_code ? params.bi_run_code : '',
+        xpNGLSq:        params.sq_xp_code ? params.sq_xp_code : '',
+        dateComplete:   formatted_date,
+        summary:        summary ? summary : [:]
+    ]
+    return get_workflow_info(final_fields)
+}
+
+/**
+ * Builds the field map used to render the error e-mail.
+ *
+ * @param formatted_date  timestamp supplied by the caller; currently not used by this function
+ * @return the result of get_workflow_info() applied to the subject and project fields
+ */
+def create_error_email_fields(formatted_date) {
+    // last path component of the input directory, flagged as an error in the subject
+    def error_subject_sufix = params.inputdir.split('/')[-1] + " : ERROR"
+    def error_fields = [
+        subject_prefix: "[${params.sequencer}]",
+        subject_sufix:  error_subject_sufix,
+        project:        params.project
+    ]
+    return get_workflow_info(error_fields)
+}
+
+/**
+ * Registers the workflow completion and error handlers: they send the final
+ * (or error) e-mail, log the outcome and, after a successful run outside a
+ * 'dev' profile, delete the work directory.
+ *
+ * Recipients are resolved inside the handlers, when they fire: workflow.success
+ * only reflects the outcome of the run at that point, so testing it when this
+ * function is called (before the run has finished) cannot pick the right
+ * address.
+ *
+ * @param summary  run summary forwarded to create_final_email_fields()
+ */
+def endOfPipelineEvents(summary) {
+    SimpleDateFormat format = new SimpleDateFormat("dd/MM/yyyy HH:mm:ss")
+    // Shared between both handlers and passed to / returned by sendFinalMail
+    // (NOTE(review): presumably used there to avoid sending the mail twice — confirm).
+    end_mail_sent = false
+
+    // Returns [address, cc] for the current workflow state.
+    def resolveRecipients = { ->
+        def address = emails_map.main
+        def cc = emails_map.bioinfo
+
+        // Failed run: notify the failure address only.
+        if (emails_map.main && emails_map.failure && !workflow.success) {
+            address = emails_map.failure
+            cc = ''
+        }
+        // Dev mode: notify the developer only, with nobody in copy.
+        if (params.is_dev_mode) {
+            address = emails_map.dev
+            cc = ''
+        }
+        return [address, cc]
+    }
+
+    workflow.onComplete {
+        def (email_address, email_cc) = resolveRecipients()
+
+        log.info "Sending final e-mail"
+        template_final = "$baseDir/assets/final_email_template.txt"
+        final_email_fields = create_final_email_fields(format.format(new Date()), summary)
+        end_mail_sent = sendFinalMail(template_final, final_email_fields, email_address, email_cc, end_mail_sent)
+
+        if (workflow.success) {
+            // remove work directory if pipeline is successful (kept for 'dev' profiles)
+            if (!workflow.profile.contains('dev')) {
+                println "Pipeline terminé avec succès => suppression du workdir : $workflow.workDir"
+                workflow.workDir.deleteDir()
+            }
+
+            if (workflow.stats.ignoredCount > 0) {
+                log.warn "Warning, pipeline completed, but with errored process(es) "
+                log.warn "Number of ignored errored process(es) : ${workflow.stats.ignoredCount} "
+                log.warn "Number of successfully ran process(es) : ${workflow.stats.succeedCount} "
+            }
+
+            log.info "[$workflow.manifest.name] Pipeline completed successfully at $workflow.complete"
+        } else {
+            log.error "[$workflow.manifest.name] Pipeline completed with errors at $workflow.complete"
+        }
+    }
+
+    workflow.onError {
+        def (email_address, email_cc) = resolveRecipients()
+
+        error_email_fields = create_error_email_fields(format.format(new Date()))
+        template_error = "$baseDir/assets/error_email_template.txt"
+        log.info "Sending error e-mail"
+        end_mail_sent = sendFinalMail(template_error, error_email_fields, email_address, email_cc, end_mail_sent)
+    }
+}
+
+/**
+ * Builds the run summary, stamped with the current date and time.
+ *
+ * @return the result of createSummary() for a "dd/MM/yyyy HH:mm:ss" timestamp
+ */
+def getSummary() {
+    def timestamp = new SimpleDateFormat("dd/MM/yyyy HH:mm:ss").format(new Date())
+    return createSummary(timestamp)
+}
+
+// ----------------------------------
+//          Auto Execution    
+// ----------------------------------
+// Show help message
+// Help was requested: print it and stop. Nothing went wrong, so the status is 0.
+if (params.help) {
+    helpMessage(pipeline_options_map, 'S H O R T  R E A D S - N F   P I P E L I N E')
+    exit 0
+}
+
+// Parameter validation
+// paramsValidation() returns true when at least one mandatory parameter is missing
+if (paramsValidation(pipeline_options_map)) {
+	log.error "\t-> Il manque au moins un paramètre obligatoire."
+	// Non-zero status so that calling scripts and schedulers see the failure
+	exit 1
+} else {
+	log.info "\t-> OK"
+	printOptions(pipeline_options_map)
+}
+
+// Email on start
+// The copy list keeps only the addresses that are actually set, so an empty
+// labo address no longer produces a leading comma.
+customMailSend(
+    "$baseDir/assets/begin_template.txt",
+    begin_email_fields,
+    emails_map.main,
+    [emails_map.labo, emails_map.bioinfo].findAll().join(','),
+    !workflow.resume,
+    false
+)
+
--- a/main.nf
+++ b/main.nf
@@ -20,11 +20,6 @@ This script is based on :
 - the Curie institute template https://github.com/bioinfo-pf-curie/geniac-template/

 */
-import java.text.SimpleDateFormat
-SimpleDateFormat format = new SimpleDateFormat("dd/MM/yyyy HH:mm:ss")
-include {createSummary} from "$baseDir/conf/functions.config"
-params.summary = createSummary(format.format(new Date()))
-params.summary.collect{k,v -> println "$k : $v"}


 /*

--- a/nextflow.config
+++ b/nextflow.config
@@ -4,15 +4,14 @@
 params {
 	// ----- GLOBAL PARAMETERS -----
 	inputdir = ""
-	outdir_prefix = ""
 	project = ""
+	project_hash = ""
 	select_samples = ""
 	sequencer = ""
 	machine_id = ""
 	fc_id = ""
 	fc_type = ""
 	lane = ""
-	merge_lanes = false

 	data_nature = ""	// AMPLICON, 16S, WGS, TRANSCRIPTOMIC, ...  //
 	species = ""
@@ -34,8 +33,6 @@ params {
 	aviti_subset_seq = "50000000"							// in reads 
 	large_indexing_nova_subset_seq = "500000"				// in reads
 	coverage = 0											// coverage in X
-	len_r1 = 150
-	len_r2 = 150

 	// DNA / RNA params
 	reference_genome = ""
@@ -105,7 +102,8 @@ params {
 //=========================================
 import java.nio.file.Files
 import java.nio.file.Paths
-def n_read_files = Files.walk(Paths.get(params.inputdir.toString()), 3)
+def samples_max_depth = (params.sequencer =~ /AVITI.*/) ? 4 : 3	// search depth: 4 for any AVITI model (e.g. 'AVITI2'), same /AVITI.*/ match as the subset_seq selection in this file; 3 otherwise
+def n_read_files = Files.walk(Paths.get(params.inputdir.toString()), samples_max_depth)
 	.filter(Files::isRegularFile)
 	.filter(p -> p.getFileName().toString().matches(".*_L00${params.lane}_R[12](_.*)?\\.fastq\\.gz"))
 	.count()
@@ -116,16 +114,20 @@ System.out.println "\nNombre d'échantillons trouvés sur cette lane : $n_sample
 def factor = java.lang.Math.round(0.1 * n_samples)
 params.resource_factor =  factor > 1 ? factor : 1

+// Dynamics params, depend on others
 import java.text.SimpleDateFormat
 SimpleDateFormat uniqueness_format = new SimpleDateFormat("yyyyMMddHHmmss")
+nf_uniqueness = uniqueness_format.format(new Date())
+if (params.inputdir != '' && params.project != '' && params.run_name != '') {
+	outdir_prefix = "${params.inputdir}/nextflow/${params.project}/${params.run_name}"
+} else {
+	outdir_prefix = "${launchDir}/results"
+}
+params.outdir = "${outdir_prefix}_${nf_uniqueness}"

-params {
-	// Dynamics params, depend on others
-	samplesheet = inputdir.toString() + "/SampleSheet.csv"
-	nf_uniqueness = uniqueness_format.format(new Date())
-	outdir_prefix = outdir_prefix ?: project + "_" + run_name
-	outdir = inputdir + "/nextflow/" + run_name + "/" + outdir_prefix + "_" + nf_uniqueness
+params.samplesheet = params.inputdir ? params.inputdir+ "/SampleSheet.csv" : ''	

+params {
 	subset_seq = miseq_subset_seq	
 	if ( sequencer =~ /NovaSeq.*/ || sequencer =~ /AVITI.*/ ) {
 		if ( n_samples >= large_sampling_threshold ) {
@@ -133,7 +135,7 @@ params {
 		}
 		subset_seq = nova_subset_seq
 	}
-	if ( DTM_mode ) {
+	if ( params.DTM_mode == true) {
 		subset_seq = "1000000000"
 	}
 }

--- a/workflow/short_reads_qc.nf
+++ b/workflow/short_reads_qc.nf
@@ -2,29 +2,9 @@

 nextflow.enable.dsl = 2

-// Import custom functions
-include {	helpMessage;
-			createSummary;
-			sendBeginMail;
-			sendFinalMail;
-			softwareVersionsToYAML
-} from "$baseDir/conf/functions.config"
-
-// Show help message
-if (params.help) {
-    helpMessage()
-    exit 0
-}
-
-// Print every non-void parameters
-System.out.println "\nAffichage de tous les paramètres non vides :"
-params.each{entry ->
-	if (entry.value != "") {
-		println "$entry.key:\t $entry.value"
-	}
-}
-System.out.println "\n"
-
+include { 	getSummary; 
+			endOfPipelineEvents 		} from "${baseDir}/lib/pipeline.groovy"
+include { 	softwareVersionsToYAML		} from "${params.shared_modules}/lib/utils.groovy"
 // -------------------------------------------------
 // 					CHANNELS
 // -------------------------------------------------
@@ -86,7 +66,8 @@ mismatchNumber = params.sequencer == 'MiSeq'? 0 : 1
 //banksForConta = params.addBankForConta ? params.genomesRefForConta << params.addBankForConta : params.genomesRefForConta

 createDir = file(params.outdir).mkdir()
-
+params.summary = getSummary()
+params.summary.collect{k,v -> println "$k : $v"}
 // -------------------------------------------------
 // 					INCLUDES
 // -------------------------------------------------
@@ -107,14 +88,6 @@ include {	MULTIQC		} from "${params.shared_modules}/multiqc.nf"
 include {	GCBIAS as GC_BIAS } from "${params.shared_modules}/gcbias.nf"
 include {	workflow_summary as WORKFLOW_SUMMARY } from "${params.shared_modules}/workflow_summary.nf"

-
-// -------------------------------------------------
-// 					 EMAIL ON START
-// -------------------------------------------------
-import java.text.SimpleDateFormat
-SimpleDateFormat format = new SimpleDateFormat("dd/MM/yyyy HH:mm:ss")
-sendBeginMail(format.format(new Date()))
-
 // -------------------------------------------------
 // 					WORKFLOW
 // -------------------------------------------------
@@ -231,19 +204,4 @@ workflow SHORT_READS_QC {
 	}
 }

-// -------------------------------------------------
-// 				EMAIL ON COMPLETE
-// -------------------------------------------------
-def end_mail_sent = false
-workflow.onComplete {
-	end_mail_sent = sendFinalMail(format.format(new Date()), params.summary)
-
-	// remove work directory if pipeline is successful
-	if (workflow.success && !( params.is_dev_mode ||  params.DTM_mode)) {
-		println "Pipeline terminé avec succès => suppression du workdir : $workflow.workDir"
-		exec:
-			workflow.workDir.deleteDir()
-	}
-}
-
-workflow.onError { }
\ No newline at end of file
+endOfPipelineEvents(params.summary)
\ No newline at end of file