Commit 9b4958f9 authored by Jasper Koehorst's avatar Jasper Koehorst
Browse files

first setup for phyloseq / biom creation from a .job file created by the web interface

parent 05a60beb
# Ignore Gradle project-specific cache directory
.gradle
# Ignore Gradle build output directory
build
unlock
/*
* This file was generated by the Gradle 'init' task.
*
* This generated file contains a sample Java project to get you started.
* For more details take a look at the Java Quickstart chapter in the Gradle
* user guide available at https://docs.gradle.org/5.0/userguide/tutorial_java_projects.html
*/
// Build-script classpath: the repositories and dependency below are used to
// resolve the Shadow plugin itself, which is applied later via `apply plugin`.
buildscript {
repositories {
mavenCentral()
mavenLocal()
maven { url 'https://repo.gradle.org/gradle/libs-releases' }
maven { url "https://plugins.gradle.org/m2/" }
}
dependencies {
// Shadow plugin, which provides the shadowJar task configured further down
classpath "com.github.jengelman.gradle.plugins:shadow:6.0.0"
}
}
plugins {
// Apply the java plugin to add support for Java
id 'java'
// Apply the application plugin to add support for building an application
id 'application'
}
apply plugin: "com.github.johnrengelman.shadow"
// Configure the archive produced by the Shadow plugin's shadowJar task.
shadowJar {
// Base file name of the archive
baseName = 'Phyloseq'
// Classifier and version are cleared, presumably so the file name carries no
// suffix -- TODO confirm against the Gradle/Shadow versions in use.
classifier = null
version = null
// The archive content is taken from the runtimeClasspath configuration
configurations = [project.configurations.runtimeClasspath]
}
// Define the main class for the application
mainClassName = 'nl.munlock.App'
// Record the entry point in the manifest of the jar task's archive
// (the same class that mainClassName points at).
jar {
manifest {
attributes 'Main-Class': 'nl.munlock.App'
}
}
artifacts {
archives shadowJar
}
group = 'nl.munlock.phyloseq'
version = '0.1.0'
description = """Phyloseq object creator"""
repositories {
mavenLocal()
mavenCentral()
}
// Project dependencies. Only JUnit and the two locally installed artifacts at
// the bottom are active; every other entry is commented out.
dependencies {
// // https://mvnrepository.com/artifact/com.google.guava/guava
// compile group: 'com.google.guava', name: 'guava', version: '27.1-jre'
// // This dependency is found on compile classpath of this component and consumers.
// implementation 'com.google.guava:guava:26.0-jre'
// Use JUnit test framework
testImplementation 'junit:junit:4.12'
// // https://mvnrepository.com/artifact/org.apache.poi/poi
// compile group: 'org.apache.poi', name: 'poi', version: '4.1.0'
// // https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml
// compile group: 'org.apache.poi', name: 'poi-ooxml', version: '4.1.0'
// // https://mvnrepository.com/artifact/com.github.samtools/htsjdk
// compile group: 'com.github.samtools', name: 'htsjdk', version: '2.19.0'
// // https://mvnrepository.com/artifact/com.jcraft/jsch
// compile group: 'com.jcraft', name: 'jsch', version: '0.1.55'
// // Kubernetes api
// compile 'io.kubernetes:client-java:5.0.0'
// // Logger
// compile group: 'log4j', name: 'log4j', version: '1.2.17'
// // https://mvnrepository.com/artifact/com.esotericsoftware.yamlbeans/yamlbeans
// compile group: 'com.esotericsoftware.yamlbeans', name: 'yamlbeans', version: '1.15'
// Locally installed jargon and unlock api: these coordinates match the
// groupId/artifactId/version passed to `mvn install:install-file` by the
// installation script, so that script has to run before the build can
// resolve them (via mavenLocal()).
implementation group: 'jargon', name: 'core', version: '4.3.0.2'
implementation group: 'nl.munlock', name: 'unlockapi', version: '1.0.1'
}
\ No newline at end of file
#!/bin/bash
#============================================================================
#title :IRODS Phyloseq
#description :IRODS Phyloseq installation script
#author :Jasper Koehorst
#date :2021
#version :0.0.1
#============================================================================
# Directory containing this script. Every path below is resolved against it so
# the installer behaves the same regardless of the caller's working directory.
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# Bring the checkout up to date before building (best effort: a failing pull
# does not stop the installation).
git -C "$DIR" pull

# fetch URL TARGET
# Download URL to TARGET unless a non-empty TARGET already exists.
# The download goes to a temporary ".part" file first so that an interrupted or
# failed transfer never leaves an empty/truncated TARGET behind that would be
# mistaken for a finished download on the next run.
fetch () {
  local url="$1"
  local target="$2"
  if [ -s "$target" ]; then
    echo "$target already there; not retrieving."
    return 0
  fi
  if ! wget "$url" -O "$target.part"; then
    rm -f "$target.part"
    echo "ERROR: could not download $url" >&2
    exit 1
  fi
  mv "$target.part" "$target"
}

# ////////////////////////////////////////////////////////////////////////////////////
# Jargon dependency
# ////////////////////////////////////////////////////////////////////////////////////
fetch https://github.com/DICE-UNC/jargon/releases/download/4.3.0.2-RELEASE/jargon-core-4.3.0.2-RELEASE-jar-with-dependencies.jar "$DIR/jargon-core-4.3.0.2-RELEASE-jar-with-dependencies.jar"
mvn install:install-file -Dfile="$DIR/jargon-core-4.3.0.2-RELEASE-jar-with-dependencies.jar" -DgroupId=jargon -DartifactId=core -Dversion=4.3.0.2 -Dpackaging=jar

# ////////////////////////////////////////////////////////////////////////////////////
# // UNLOCK API
# ////////////////////////////////////////////////////////////////////////////////////
# NOTE(review): this artifact is fetched over plain http; switch to https if the
# server supports it.
fetch http://download.systemsbiology.nl/unlock/UnlockOntology.jar "$DIR/UnlockOntology.jar"
mvn install:install-file -Dfile="$DIR/UnlockOntology.jar" -DgroupId=nl.munlock -DartifactId=unlockapi -Dversion=1.0.1 -Dpackaging=jar

# Building mode: pass "test" as first argument to run the tests as part of the build.
# A failed build aborts the script so that no stale jar is copied afterwards.
if [ "$1" == "test" ]; then
  gradle build -b "$DIR/build.gradle" --info || exit 1
else
  echo "Skipping tests, run './install.sh test' to perform tests"
  gradle build -b "$DIR/build.gradle" -x test || exit 1
fi

# Place the freshly built jar(s) next to this script.
cp "$DIR"/build/libs/*jar "$DIR/"
\ No newline at end of file
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
#!/usr/bin/env sh
#
# Copyright 2015 the original author or authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
##############################################################################
##
## Gradle start up script for UN*X
##
##############################################################################
# Attempt to set APP_HOME
# Resolve links: $0 may be a link
# Start from the path this script was invoked with.
PRG="$0"
# Need this for relative symlinks.
# While PRG is a symbolic link, read its target from the `ls -ld` output
# (the text after "-> ") and follow it.
while [ -h "$PRG" ] ; do
ls=`ls -ld "$PRG"`
link=`expr "$ls" : '.*-> \(.*\)$'`
# An absolute target replaces PRG; a relative one is resolved against the
# directory that contains the link.
if expr "$link" : '/.*' > /dev/null; then
PRG="$link"
else
PRG=`dirname "$PRG"`"/$link"
fi
done
# Remember the caller's working directory, change into the directory of the
# resolved script to obtain its physical path (pwd -P) as APP_HOME, then
# change back.
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null
APP_NAME="Gradle"
APP_BASE_NAME=`basename "$0"`
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"
# Print a warning: all arguments joined into a single line on standard output.
warn () {
echo "$*"
}
# Print the given message surrounded by blank lines on standard output and
# terminate the script with exit status 1.
die () {
echo
echo "$*"
echo
exit 1
}
# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
case "`uname`" in
CYGWIN* )
cygwin=true
;;
Darwin* )
darwin=true
;;
MINGW* )
msys=true
;;
NONSTOP* )
nonstop=true
;;
esac
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
# Determine the Java command to use to start the JVM.
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
JAVACMD="$JAVA_HOME/jre/sh/java"
else
JAVACMD="$JAVA_HOME/bin/java"
fi
if [ ! -x "$JAVACMD" ] ; then
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
else
JAVACMD="java"
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
# Increase the maximum file descriptors if we can.
# Skipped on Cygwin, Darwin and NonStop.
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
# Query the hard limit for open file descriptors.
MAX_FD_LIMIT=`ulimit -H -n`
if [ $? -eq 0 ] ; then
# "maximum"/"max" means: use the hard limit that was just queried;
# any other value of MAX_FD is passed to ulimit unchanged.
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
MAX_FD="$MAX_FD_LIMIT"
fi
ulimit -n $MAX_FD
# Failing to raise the limit only produces a warning; the script continues.
if [ $? -ne 0 ] ; then
warn "Could not set maximum file descriptor limit: $MAX_FD"
fi
else
warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
fi
fi
# For Darwin, add options to specify how the application appears in the dock
if $darwin; then
GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
fi
# For Cygwin or MSYS, switch paths to Windows format before running java
if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
APP_HOME=`cygpath --path --mixed "$APP_HOME"`
CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
JAVACMD=`cygpath --unix "$JAVACMD"`
# We build the pattern for arguments to be converted via cygpath
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
SEP=""
for dir in $ROOTDIRSRAW ; do
ROOTDIRS="$ROOTDIRS$SEP$dir"
SEP="|"
done
OURCYGPATTERN="(^($ROOTDIRS))"
# Add a user-defined pattern to the cygpath arguments
if [ "$GRADLE_CYGPATTERN" != "" ] ; then
OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
fi
# Now convert the arguments - kludge to limit ourselves to /bin/sh
i=0
for arg in "$@" ; do
CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
else
eval `echo args$i`="\"$arg\""
fi
i=`expr $i + 1`
done
case $i in
0) set -- ;;
1) set -- "$args0" ;;
2) set -- "$args0" "$args1" ;;
3) set -- "$args0" "$args1" "$args2" ;;
4) set -- "$args0" "$args1" "$args2" "$args3" ;;
5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
esac
fi
# Escape application args
# Print every argument in a form that survives the `eval set --` further down:
# each argument is wrapped in single quotes, embedded single quotes are
# rewritten as '\'' and a trailing " \" continuation is appended after each
# argument. A final line holding a single space terminates the output.
save () {
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
echo " "
}
APP_ARGS=`save "$@"`
# Collect all arguments for the java command, following the shell quoting and substitution rules
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
exec "$JAVACMD" "$@"
@rem
@rem Copyright 2015 the original author or authors.
@rem
@rem Licensed under the Apache License, Version 2.0 (the "License");
@rem you may not use this file except in compliance with the License.
@rem You may obtain a copy of the License at
@rem
@rem https://www.apache.org/licenses/LICENSE-2.0
@rem
@rem Unless required by applicable law or agreed to in writing, software
@rem distributed under the License is distributed on an "AS IS" BASIS,
@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@rem See the License for the specific language governing permissions and
@rem limitations under the License.
@rem
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem Gradle startup script for Windows
@rem
@rem ##########################################################################
@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal
set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%
@rem Resolve any "." and ".." in APP_HOME to make it shorter.
for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto execute
echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:findJavaFromJavaHome
@rem Remove any double quotes from JAVA_HOME before composing the java.exe path
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
@rem Continue with execution only when that java.exe exists; otherwise report
@rem the invalid JAVA_HOME and jump to the failure exit.
if exist "%JAVA_EXE%" goto execute
echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:execute
@rem Setup the command line
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd
:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1
:mainEnd
if "%OS%"=="Windows_NT" endlocal
:omega
/*
* This file was generated by the Gradle 'init' task.
*
* The settings file is used to specify which projects to include in your build.
*
* Detailed information about configuring a multi-project build in Gradle can be found
* in the user guide at https://docs.gradle.org/4.5.1/userguide/multi_project_builds.html
*/
rootProject.name = 'Phyloseq'
package nl.munlock;
import nl.munlock.irods.Connection;
import nl.munlock.options.irods.CommandOptions;
import nl.wur.ssb.RDFSimpleCon.api.Domain;
import org.apache.log4j.Logger;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Scanner;
import static nl.munlock.Phyloseq.*;
/**
 * Command-line entry point that turns a job file (a list of iRODS paths) into
 * the tables written by {@link Phyloseq}.
 *
 * <p>Paths ending in {@code .ttl} are merged into {@link #metadataDomain};
 * paths ending in {@code .hdt} are handed to {@code Phyloseq.generate}. All
 * other lines of the job file are ignored.
 */
public class App {
    private static final Logger log = Generic.getLogger(App.class, false);
    public static CommandOptions commandOptions;
    public static Domain metadataDomain;

    /**
     * Main landing point for yaml or kubernetes runs...
     *
     * @param args command-line arguments, parsed by {@link CommandOptions}
     * @throws Exception when the job file cannot be read, a download fails or
     *                   one of the processing steps throws
     */
    public static void main(String[] args) throws Exception {
        commandOptions = new CommandOptions(args);
        // Make the irods connection
        Connection connection = new Connection(commandOptions);

        // Collect the files listed in the job file
        HashSet<String> hdts = new HashSet<>();
        HashSet<String> turtles = new HashSet<>();
        readJobFile(new File(commandOptions.jobFile), hdts, turtles);

        // Turn all turtle files into a new domain object
        mergeTurtles(connection, turtles);

        // Generate objects for phyloseq or biom while downloading them. Redownload if job fails?
        processHdts(connection, hdts);

        // shutting down all writers
        // (these names are not declared in this class; presumably they are the
        // static members pulled in by the static import of Phyloseq -- confirm)
        logFileWriter.close();
        metadataFileWriter.close();
        domain.close();
        asvFileWriter.close();
        taxonFileWriter.close();
        seqFileWriter.close();

        // Check the metadata but should be ok now...
        checkMetadata(new File(commandOptions.prefix + "_metadata.tsv"));
    }

    /** Sorts the lines of the job file into hdt and turtle paths by file extension. */
    private static void readJobFile(File jobFile, HashSet<String> hdts, HashSet<String> turtles) throws Exception {
        // try-with-resources: the scanner is closed even when reading fails
        try (Scanner scanner = new Scanner(jobFile)) {
            while (scanner.hasNextLine()) {
                String file = scanner.nextLine();
                if (file.endsWith(".hdt")) {
                    hdts.add(file);
                } else if (file.endsWith(".ttl")) {
                    turtles.add(file);
                }
            }
        }
    }

    /**
     * Requests the file twice, exactly like the original "download" and
     * "validate" steps did. NOTE(review): the second call presumably verifies
     * the local copy; confirm against nl.munlock.irods.Generic.downloadFile.
     */
    private static void fetch(Connection connection, String path) throws Exception {
        // Download
        nl.munlock.irods.Generic.downloadFile(connection, new File(path));
        // And validate
        nl.munlock.irods.Generic.downloadFile(connection, new File(path));
    }

    /** Downloads every turtle file and adds its statements to a fresh {@link #metadataDomain}. */
    private static void mergeTurtles(Connection connection, HashSet<String> turtles) throws Exception {
        metadataDomain = new Domain("");
        for (String turtle : turtles) {
            fetch(connection, turtle);
            log.info("Merging " + turtle);
            // Named turtleDomain so it cannot be confused with the `domain`
            // that main() closes after processing.
            Domain turtleDomain = new Domain("file://." + turtle);
            try {
                metadataDomain.getRDFSimpleCon().getModel().add(turtleDomain.getRDFSimpleCon().getModel().listStatements());
            } finally {
                // Release the per-file domain even when merging fails
                turtleDomain.close();
            }
        }
    }

    /** Downloads every hdt file and passes the local copy ("." + path) to {@code Phyloseq.generate}. */
    private static void processHdts(Connection connection, HashSet<String> hdts) throws Exception {
        int counter = 0;
        for (String hdt : hdts) {
            fetch(connection, hdt);
            // Counter for process printing
            counter = counter + 1;
            if (counter % 100 == 0) {
                log.info("Processing " + counter + " " + hdt);
            }
            // Parse the hdt file to create phyloseq content
            Phyloseq.generate(new File("." + hdt));
        }
    }

    /**
     * Scans the tab-separated metadata table and reports keys (first two
     * columns) that occur with different values (third column).
     * Conflicts on keys containing "logicalPath" are not reported.
     */
    private static void checkMetadata(File metadataFile) throws Exception {
        HashMap<String, String> metadata = new HashMap<>();
        try (Scanner scanner = new Scanner(metadataFile)) {
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine();
                // Limit -1 keeps trailing empty fields, so a row with an empty
                // value still yields three columns instead of failing on [2].
                String[] lineSplit = line.split("\t", -1);
                if (lineSplit.length < 3) {
                    log.warn("Skipping malformed metadata line " + line);
                    continue;
                }
                String keys = lineSplit[0] + "\t" + lineSplit[1];
                String value = lineSplit[2];
                String known = metadata.get(keys);
                if (known == null) {
                    metadata.put(keys, value);
                } else if (!known.equals(value) && !keys.contains("logicalPath")) {
                    // Often occurrence when sample dataset is used elsewhere
                    log.error("Duplication info found " + line);
                    log.error("Duplication info found " + keys + " " + known);
                }
            }
        }
    }
}
package nl.munlock;
import nl.munlock.irods.Connection;
import org.apache.commons.io.FilenameUtils;
import org.apache.log4j.*;
import org.irods.jargon.core.checksum.ChecksumValue;
import org.irods.jargon.core.exception.JargonException;
import org.irods.jargon.core.pub.DataObjectChecksumUtilitiesAO;
import org.irods.jargon.core.pub.DataTransferOperations;
import org.irods.jargon.core.pub.io.IRODSFile;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.FileNotFoundException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Generic {
private static final org.slf4j.Logger log = LoggerFactory.getLogger(Generic.class);
static ArrayList<String> processed = new ArrayList<>();
/**
 * Logger initialization with debug option.
 *
 * <p>Each call removes all appenders from the log4j root logger and attaches
 * two new ones: a console appender (threshold always DEBUG) and a file
 * appender that appends to {@code runner.log}. The returned logger's level
 * and the file appender's threshold are DEBUG when {@code debug} is set and
 * INFO otherwise.
 *
 * @param clazz class the returned logger is obtained for
 * @param debug boolean if debug mode should be enabled
 * @return the log4j logger for {@code clazz}
 */
public static Logger getLogger(Class<?> clazz, boolean debug) {
    // One layout pattern and one level shared by everything configured below
    final String pattern = "%d %-5p [%c{1}] %m%n";
    final Level level = debug ? Level.DEBUG : Level.INFO;

    ConsoleAppender console = new ConsoleAppender();
    console.setLayout(new PatternLayout(pattern));
    console.setThreshold(Level.DEBUG);
    console.activateOptions();
    // Start from a clean root logger so appenders are not attached twice
    Logger.getRootLogger().removeAllAppenders();
    Logger.getRootLogger().addAppender(console);

    Logger logger = Logger.getLogger(clazz);
    logger.setLevel(level);

    FileAppender fa = new FileAppender();
    fa.setName("iRODS Logger");
    fa.setFile("runner.log");
    fa.setLayout(new PatternLayout(pattern));
    fa.setThreshold(level);
    fa.setAppend(true);
    fa.activateOptions();
    org.apache.log4j.Logger.getRootLogger().addAppender(fa);
    return logger;
}
/**
 * Extracts the assay directory from a path.
 *
 * <p>The absolute path of {@code collection} (with a "/" appended) is searched
 * for {@code .../S_<x>/(Amplicon|RNA|DNA)/A_<y>/}; everything up to, but not
 * including, the slash that ends the {@code A_} segment is returned.
 *
 * @param collection file or directory expected to lie at or below an assay directory
 * @return the path up to and including the {@code A_} segment
 * @throws FileNotFoundException when the path contains no such segment sequence
 */
public static String getAssayPath(File collection) throws FileNotFoundException {
    String candidate = collection.getAbsolutePath() + "/";
    Matcher assay = Pattern.compile("(.*/S_.*?/(Amplicon|RNA|DNA)/A_.*?)/").matcher(candidate);
    if (!assay.find()) {
        throw new FileNotFoundException("Assay path not found in " + collection);
    }
    return assay.group(1);
}
public static void downloadFile(Connection connection, File download) throws JargonException {
IRODSFile irodsFile = connection.fileFactory.instanceIRODSFile(download.getAbsolutePath());
if (!irodsFile.exists()) {