Commit 3588290c authored by Koehorst, Jasper
Browse files

initial sync with additional code base for conversion of gff, interpro and eggnog

parent 76d8ce04
// Build-script classpath: makes the plugins that are applied further down
// (shadow, git-versioning) available to the build script itself.
buildscript {
repositories {
maven { url 'https://repo.gradle.org/gradle/libs-releases' }
// NOTE(review): JCenter is reported to have been sunset by its operator —
// confirm the artifacts below still resolve from the other repositories.
jcenter()
maven { url "https://plugins.gradle.org/m2/" }
}
dependencies {
// Shadow plugin: provides the shadowJar task configured below
classpath 'com.github.jengelman.gradle.plugins:shadow:4.0.4'
// Auto increment version (applied below as com.zoltu.git-versioning)
classpath "gradle.plugin.com.zoltu.gradle.plugin:git-versioning:3.0.3"
// GIT access library
classpath "org.ajoberstar:grgit:1.7.2"
// NOTE(review): nothing in the visible build script references the tooling
// API directly — confirm it is still needed.
classpath "org.gradle:gradle-tooling-api:3.4.1"
}
}
// Plugins applied to this project.
apply plugin: 'maven'
apply plugin: 'java-library'
// Jar: fat-jar packaging through the Shadow plugin
apply plugin: "com.github.johnrengelman.shadow"
// Auto increment version
apply plugin: "com.zoltu.git-versioning"
// NOTE(review): 'java-library' is documented to build on the 'java' plugin, so
// this explicit application looks redundant — confirm before removing.
apply plugin: 'java'
// Source and bytecode level: Java 8.
sourceCompatibility = 1.8
targetCompatibility = 1.8
// Repositories used to resolve the project dependencies, consulted in this order.
repositories {
mavenCentral()
mavenLocal()
jcenter()
}
// Fat-jar settings for the Shadow plugin.
shadowJar {
// Fixed archive base name; classifier and version are cleared below so that
// neither is appended to the archive file name.
baseName = 'MGnifyParser'
classifier = null
version = null
// Bundle everything that is on the runtime classpath.
configurations = [project.configurations.runtimeClasspath]
}
// Manifest of the jar: declares the entry point used by `java -jar`.
// NOTE(review): no class from package nl.munlock.main is visible in this
// chunk — confirm that nl.munlock.main.App exists.
jar {
manifest {
attributes 'Main-Class': 'nl.munlock.main.App'
}
}
// Register the shaded jar as an archive artifact of the project.
artifacts { archives shadowJar }

// Maven coordinates and human-readable description of this module.
group = "nl.wur.ssb"
version = "0.1.0"
description = "MGnify module for conversion"
// Project dependencies, written in the compact 'group:name:version' notation.
dependencies {
    compile 'nl.wur.ssb:Conversion:1.0.0'
    compile 'uk.ac.ebi.ena.sequence:embl-api-ff:1.1.147'
    compile 'com.google.code.gson:gson:2.7'
    // Command line parsing (see CommandOptions)
    compile 'com.beust:jcommander:1.64'
    compile 'org.biojava:biojava-core:4.0.0'
    compile 'com.googlecode.json-simple:json-simple:1.1.1'

    // Test-only
    testCompile 'junit:junit:3.8.1'
}
\ No newline at end of file
#!/usr/bin/env sh
#
# Copyright 2015 the original author or authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
##############################################################################
##
## Gradle start up script for UN*X
##
##############################################################################
# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
# Follow the chain of symlinks until PRG names a non-link file.
while [ -h "$PRG" ] ; do
ls=`ls -ld "$PRG"`
# Extract the link target: everything after "-> " in the ls output.
link=`expr "$ls" : '.*-> \(.*\)$'`
if expr "$link" : '/.*' > /dev/null; then
# Absolute target: use it as is.
PRG="$link"
else
# Relative target: resolve it against the directory of the link.
PRG=`dirname "$PRG"`"/$link"
fi
done
# Remember the caller's directory, derive APP_HOME as the physical
# (pwd -P) directory of the script, then switch back.
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null
APP_NAME="Gradle"
# Name under which the script was invoked; passed to the JVM as org.gradle.appname.
APP_BASE_NAME=`basename "$0"`
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"
# Print a warning built from all arguments.
# The message goes to stderr so that it cannot end up in stdout that a
# caller captures or pipes into another program.
warn () {
    echo "$*" >&2
}
# Print a fatal error message (surrounded by blank lines) and terminate the
# script with exit status 1.
# The message goes to stderr so that it cannot end up in stdout that a
# caller captures or pipes into another program.
die () {
    {
        echo
        echo "$*"
        echo
    } >&2
    exit 1
}
# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
# Set the flag that matches the system name reported by uname; on any other
# system all four flags stay false.
case "`uname`" in
CYGWIN* )
cygwin=true
;;
Darwin* )
darwin=true
;;
MINGW* )
msys=true
;;
NONSTOP* )
nonstop=true
;;
esac
# The wrapper jar is the only classpath entry needed to start the wrapper.
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
# Determine the Java command to use to start the JVM.
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
JAVACMD="$JAVA_HOME/jre/sh/java"
else
JAVACMD="$JAVA_HOME/bin/java"
fi
# JAVA_HOME was given but holds no executable java: abort.
if [ ! -x "$JAVACMD" ] ; then
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
else
# No JAVA_HOME: rely on a java found through PATH, abort when there is none.
JAVACMD="java"
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
# Increase the maximum file descriptors if we can.
# Skipped on Cygwin, Darwin and NonStop.
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
# Query the hard limit; it replaces the "maximum"/"max" placeholder below.
MAX_FD_LIMIT=`ulimit -H -n`
if [ $? -eq 0 ] ; then
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
MAX_FD="$MAX_FD_LIMIT"
fi
ulimit -n $MAX_FD
# Failing to raise the limit only produces a warning; the script continues.
if [ $? -ne 0 ] ; then
warn "Could not set maximum file descriptor limit: $MAX_FD"
fi
else
warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
fi
fi
# For Darwin, add options to specify how the application appears in the dock
# The options are appended to GRADLE_OPTS with embedded quotes; those quotes
# are interpreted later by the eval that builds the final java command line.
if $darwin; then
GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
fi
# For Cygwin or MSYS, switch paths to Windows format before running java
if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
APP_HOME=`cygpath --path --mixed "$APP_HOME"`
CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
JAVACMD=`cygpath --unix "$JAVACMD"`
# We build the pattern for arguments to be converted via cygpath
# ROOTDIRS becomes a "|"-separated alternation of the directories found
# directly under "/".
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
SEP=""
for dir in $ROOTDIRSRAW ; do
ROOTDIRS="$ROOTDIRS$SEP$dir"
SEP="|"
done
OURCYGPATTERN="(^($ROOTDIRS))"
# Add a user-defined pattern to the cygpath arguments
if [ "$GRADLE_CYGPATTERN" != "" ] ; then
OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
fi
# Now convert the arguments - kludge to limit ourselves to /bin/sh
# Each argument is stored in a numbered variable args0, args1, ...: converted
# through cygpath when it matches the pattern and is not an option, otherwise
# stored as given.
i=0
for arg in "$@" ; do
CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
else
eval `echo args$i`="\"$arg\""
fi
i=`expr $i + 1`
done
# Rebuild the positional parameters from the numbered variables. Only 0 to 9
# arguments are covered; with more arguments no branch matches and the
# positional parameters stay as they were passed in.
case $i in
0) set -- ;;
1) set -- "$args0" ;;
2) set -- "$args0" "$args1" ;;
3) set -- "$args0" "$args1" "$args2" ;;
4) set -- "$args0" "$args1" "$args2" "$args3" ;;
5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
esac
fi
# Escape application args
# Prints every argument wrapped in single quotes so that the result can be
# passed through eval later: sed replaces each embedded ' with '\'' , puts an
# opening ' before the first line and appends "' \" (closing quote plus a line
# continuation) after the last line of the argument. A final " " ends the list.
save () {
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
echo " "
}
# Quote the script arguments once so they survive the eval below unchanged.
APP_ARGS=`save "$@"`
# Collect all arguments for the java command, following the shell quoting and substitution rules
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
# Replace this shell with the JVM running the wrapper main class.
exec "$JAVACMD" "$@"
@rem
@rem Copyright 2015 the original author or authors.
@rem
@rem Licensed under the Apache License, Version 2.0 (the "License");
@rem you may not use this file except in compliance with the License.
@rem You may obtain a copy of the License at
@rem
@rem https://www.apache.org/licenses/LICENSE-2.0
@rem
@rem Unless required by applicable law or agreed to in writing, software
@rem distributed under the License is distributed on an "AS IS" BASIS,
@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@rem See the License for the specific language governing permissions and
@rem limitations under the License.
@rem
@rem Command echoing is switched off unless the DEBUG variable is set.
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem Gradle startup script for Windows
@rem
@rem ##########################################################################
@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal
@rem DIRNAME is the drive and directory of this script, or the current
@rem directory when that expansion is empty.
set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
@rem APP_BASE_NAME is the script file name without extension.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%
@rem Resolve any "." and ".." in APP_HOME to make it shorter.
for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
@rem Find java.exe
@rem With JAVA_HOME defined, the executable is looked up below JAVA_HOME;
@rem otherwise java.exe has to be reachable through PATH.
if defined JAVA_HOME goto findJavaFromJavaHome
set JAVA_EXE=java.exe
@rem Probe the executable; an ERRORLEVEL of 0 means it could be started.
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto init
echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:findJavaFromJavaHome
@rem Strip any double quotes from JAVA_HOME before building the path.
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
if exist "%JAVA_EXE%" goto init
echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:init
@rem Get command-line arguments, handling Windows variants
@rem The jump target is the label on the very next line, so both outcomes of
@rem the OS test continue at the same place.
if not "%OS%" == "Windows_NT" goto win9xME_args
:win9xME_args
@rem Slurp the command line arguments.
set CMD_LINE_ARGS=
@rem NOTE(review): _SKIP is assigned here but not read anywhere in the visible script.
set _SKIP=2
:win9xME_args_slurp
@rem Without a first argument CMD_LINE_ARGS stays empty; otherwise it takes
@rem the complete argument list.
if "x%~1" == "x" goto execute
set CMD_LINE_ARGS=%*
:execute
@rem Setup the command line
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
:end
@rem End local scope for the variables with windows NT shell
@rem Reached by falling through after the Java process has returned; a zero
@rem ERRORLEVEL skips the failure path below.
if "%ERRORLEVEL%"=="0" goto mainEnd
:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1
:mainEnd
if "%OS%"=="Windows_NT" endlocal
:omega
// Name of the root project; it matches the baseName configured for shadowJar in the build script.
rootProject.name = 'MGnifyParser'
package nl.munlock.eggnog;
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParameterException;
import com.beust.jcommander.Parameters;
import life.gbol.domain.AnnotationResult;
import nl.wur.ssb.SappGeneric.CommandOptionsGeneric;
import nl.wur.ssb.SappGeneric.ImportProv;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import java.time.LocalDateTime;
import java.util.Arrays;
import java.util.UUID;
/**
 * Command line options of the eggNOG conversion.
 *
 * <p>The constructor parses the arguments with JCommander, loads the domain from the
 * input when one is given, and registers the provenance of the run: an
 * {@link ImportProv} plus an {@link AnnotationResult} that is linked to it.
 */
@Parameters(commandDescription = "Available options: ")
public class CommandOptions extends CommandOptionsGeneric {
    private static final Logger logger = Logger.getLogger(CommandOptions.class);

    /** IRI of {@link #annotResult}; set by the constructor. */
    public String annotResultIRI;

    @Parameter(names = {"-t", "-tool"}, description = "Which conversion tool to use, (conversion, interpro, eggnog, antismash)")
    String tool;

    @Parameter(names = {"-tsv"}, description = "MGnify TSV eggnog file")
    String eggnogFile;

    @Parameter(names = {"-v", "-version"}, description = "Version of eggnog used (name should match folder name in the database folder)")
    String toolversion;

    /** The arguments joined with single spaces, recorded in the provenance. */
    private String commandLine;

    private final String repository = "https://gitlab.com/sapp/annotation/eggnog";
    final String description = "Eggnog annotation module of SAPP";

    /** Annotation result created for this run; set by the constructor. */
    public AnnotationResult annotResult;

    @Parameter(names = "-starttime", description = "Start time of code", hidden = true)
    private LocalDateTime starttime = LocalDateTime.now();

    // TODO this is the default CommandOptions code ...

    /**
     * Parses the command line and prepares the provenance of the run.
     *
     * <p>When help is requested, no argument is given, or JCommander rejects the
     * arguments, the message and the usage are printed and the JVM is terminated:
     * exit code 0 if help was requested, 64 otherwise.
     *
     * @param args the raw command line arguments
     * @throws Exception when loading the input or creating the provenance fails
     */
    public CommandOptions(String[] args) throws Exception {
        try {
            new JCommander(this, args);
            this.commandLine = StringUtils.join(args, " ");

            if (this.help || args.length == 0) {
                throw new ParameterException("");
            }

            // Only the first input entry is recorded as source file of the provenance.
            String[] files = new String[0];
            if (this.input != null) {
                logger.info("Loading data from file");
                files = new String[]{this.input.get(0)};
                this.domain = nl.wur.ssb.SappGeneric.InputOutput.Input.load(this.input);
            }

            ImportProv origin = new ImportProv(domain, Arrays.asList(files), this.output, this.commandLine, this.tool, this.toolversion, this.repository, null, this.starttime, LocalDateTime.now());
            annotResult = domain.make(AnnotationResult.class, "http://gbol.life/0.1/" + UUID.randomUUID());
            annotResultIRI = annotResult.getResource().getURI();
            origin.linkEntity(annotResult);
        } catch (ParameterException pe) {
            // Asking for help is not an error; everything else is reported with exit code 64.
            int exitCode = this.help ? 0 : 64;
            System.out.println(pe.getMessage());
            new JCommander(this).usage();
            System.out.println(" * required parameter");
            System.exit(exitCode);
        }
    }
}
package nl.munlock.eggnog;
import life.gbol.domain.*;
import nl.munlock.eggnog.CommandOptions;
import nl.wur.ssb.RDFSimpleCon.ResultLine;
import nl.wur.ssb.SappGeneric.GBOL.SequenceBuilder;
import nl.wur.ssb.SappGeneric.Generic;
import nl.wur.ssb.SappGeneric.Xrefs;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.purl.ontology.bibo.domain.Document;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Scanner;
import java.util.UUID;
public class Eggnog extends SequenceBuilder {
private XRefProvenance xrefProv;
private CommandOptions arguments;
private File name;
private final Logger logger = Logger.getLogger(Eggnog.class);
final UUID xrefprovID = UUID.randomUUID();
/**
 * Runs the eggNOG conversion: parses the command line, hands the eggNOG TSV file
 * to {@code parser}, saves the domain to the requested output and closes the domain.
 *
 * <p>The domain is closed in a {@code finally} block so that it is also released
 * when parsing or saving fails.
 *
 * @param args the raw command line arguments, passed on to {@link CommandOptions}
 * @throws Exception when option handling, parsing or saving fails
 */
public Eggnog(String[] args) throws Exception {
    super(null, "http://gbol.life/0.1/");
    arguments = new CommandOptions(args);
    this.domain = arguments.domain;
    try {
        name = new File(arguments.input.get(0));
        // Every run gets its own root IRI, derived from a random UUID.
        this.rootIRI = "http://gbol.life/0.1/" + xrefprovID + "/";
        xrefProv = arguments.domain.make(XRefProvenance.class, rootIRI + "XRefProv");
        // xrefProv.setOrigin(arguments.annotResult);
        Generic.Logger(arguments.debug);

        parser(new File(arguments.eggnogFile));

        logger.info("Results are saved in: " + arguments.output.getAbsolutePath());
        nl.wur.ssb.SappGeneric.InputOutput.Output.save(arguments.domain, arguments.output);
    } finally {
        // Guarded because the domain is only loaded when an input was given.
        if (arguments.domain != null) {
            arguments.domain.close();
        }
    }
}
private void parser(File file) throws Exception {
logger.debug("Parsing the MGnify TSV file");
// Obtain protein sequences and calculate MD5
Iterator<ResultLine> resultLines = domain.getRDFSimpleCon().runQuery("getProteins.txt", true).iterator();
HashMap<String, String> locusLookup = new HashMap<>();
while (resultLines.hasNext()) {
ResultLine resultLine = resultLines.next();
String iri = resultLine.getIRI("protein");
String locus = resultLine.getLitString("locus");
locusLookup.put(locus.toUpperCase(), iri);
}
Scanner scanner = new Scanner(file);
int annotationParsed = 0;
while (scanner.hasNext()) {
annotationParsed++;
if (annotationParsed % 100 == 0)
System.out.print("Annotations parsed: " + annotationParsed + "\r");
String line = scanner.nextLine();
if (line.startsWith("#"))
continue;
String[] lineArray = line.split("\t");
//#query_name
String query_name = lineArray[0]; // done
String seed_eggNOG_ortholog = lineArray[1]; // done
String seed_ortholog_evalue = lineArray[2]; // done
String seed_ortholog_score = lineArray[3]; // done
String best_tax_level = lineArray[4];
String Preferred_name = lineArray[5];
String GOs = lineArray[6]; // xref
String EC = lineArray[7]; // xref
String KEGG_ko = lineArray[8]; // xref
String KEGG_Pathway = lineArray[9]; //xref
String KEGG_Module = lineArray[10]; //xref
String KEGG_Reaction = lineArray[11]; //xref
String KEGG_rclass = lineArray[12]; //xref
String BRITE = lineArray[13]; //xref
String KEGG_TC = lineArray[14]; //xref
String CAZy = lineArray[15]; //xref
String BiGG_Reaction = lineArray[16]; //xref
String proteinURI = locusLookup.get(query_name);
if (proteinURI == null) {
throw new Exception("Null found!? for " + file);
}
Protein proteinRDF = arguments.domain.make(life.gbol.domain.Protein.class, proteinURI);
// System.err.println(StringUtils.join(lineArray, "\n"));
// Convert to RDF! - protein URI + seed_eggNOG_ortholog identifier
ProteinDomain proteinDomain = arguments.domain.make(ProteinDomain.class, proteinRDF.getResource().getURI() + "/" + seed_eggNOG_ortholog);
// Annotation result
AnnotationResult annotationResult = arguments.domain.make(AnnotationResult.class, arguments.annotResultIRI);
// Provenance
String version = "1.0-MGnify";
String library = "eggNOG";
life.gbol.domain.FeatureProvenance featureprov = arguments.domain.make(life.gbol.domain.FeatureProvenance.class, proteinDomain.getResource().getURI() + "/" + library + "/" + version);
nl.systemsbiology.semantics.sapp.domain.InterProScan provannot = arguments.domain.make(nl.systemsbiology.semantics.sapp.domain.InterProScan.class, proteinDomain.getResource().getURI() + "/" + library + "/" + version + "/prov"); // scoretry {
if (seed_ortholog_evalue.contains(";")) {
provannot.setEvalue(Double.valueOf(seed_ortholog_evalue.split(";")[1]));
} else {
provannot.setEvalue(Double.valueOf(seed_ortholog_evalue));
}
if (seed_ortholog_score.contains(";"))
provannot.setScore(Double.valueOf(seed_ortholog_score.split(";")[1]));
else {
provannot.setScore(Double.valueOf(seed_ortholog_score));
}
Document document = domain.make(Document.class, "https://blabla.com");
document.setTitle("EGNOGG BLA");
document.setAbstract("ABSTRACT HERE");
// document.setDateAccepted(LocalDate.parse("2020-11-06").atStartOfDay());
document.setDoi("DOI HERE");
provannot.setReference(document);
featureprov.setAnnotation(provannot);
featureprov.setOrigin(annotationResult);
proteinDomain.addProvenance(featureprov);
XRefProvenance xrefprov = arguments.domain.make(life.gbol.domain.XRefProvenance.class, featureprov.getResource().getURI() + "/xrefprov");
xrefprov.setOrigin(annotationResult);
for (String go_annotation : GOs.split(",")) {
if (go_annotation.length() == 0) continue;
XRef xref = Xrefs.create(XRef.class, arguments.domain, xrefprov,