Commit ef1027e7 authored by Haarst, Jan van
Browse files

First import of script to extract scansize information from a BNX file.

parents
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 27 13:27:16 2015
@author: traca001
"""
import os
import sys
import json
import fileinput
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Set to any truthy value to trace parsing on standard output.
debug = ''
# Per-run totals filled in by parse_bnx: {RunId: {ScanNumber: summed Length}}.
runsizes = dict()


def parse_bnx(file_path, size_dict=None):
    """Sum molecule lengths per scan for the BNX file at ``file_path``.

    Two kinds of tab-separated lines are used; all others are skipped:

    * Run header lines (starting with ``'# Run '``): the 2nd field is the
      SourceFolder, the 8th field is NumberofScans and the last field is
      the RunId.  Every scan 1..NumberofScans of that run is initialised
      to 0, both in ``runsizes[RunId]`` and in ``size_dict``.
    * Molecule lines (starting with ``'0\\t'``): the 3rd field is Length,
      the 8th field is ScanNumber and the second to last field is the
      RunId.  Length is added to ``runsizes[RunId][ScanNumber]`` and to
      ``size_dict[ScanNumber]``.

    Args:
        file_path: Path of the BNX file to read.  It is handed to
            ``fileinput.FileInput``, so ``'-'`` reads standard input.
        size_dict: Optional dict to accumulate into (mutated in place).
            A fresh dict is created when omitted, so separate calls no
            longer share state through the default argument.

    Returns:
        ``size_dict``, mapping ScanNumber to the summed Length over all
        runs.  The per-run breakdown is stored in the module-level
        ``runsizes`` dict as a side effect.

    Raises:
        KeyError: if a molecule line refers to a RunId or ScanNumber that
            no preceding run header declared.
        ValueError, IndexError: if a used field is missing or not numeric.
    """
    if size_dict is None:
        size_dict = {}
    # A private FileInput instance reads the requested file rather than
    # whatever happens to be in sys.argv, and avoids fileinput's global state.
    bnx_lines = fileinput.FileInput(file_path)
    try:
        for line in bnx_lines:
            if line.startswith('# Run '):
                if debug:
                    print(line.strip())
                fields = line.split('\t')
                # The SourceFolder of this Run is the 2nd entry of the line.
                # Assigned unconditionally so the debug print below cannot
                # hit an undefined name.
                SourceFolder = fields[1]
                if debug:
                    print(SourceFolder)
                # The RunId of this Run is the last entry of the line
                RunId = int(fields[-1])
                if debug:
                    print(RunId)
                runsizes[RunId] = dict()
                # The NumberofScans of this Run is the 8th entry of the line
                NumberofScans = int(fields[7])
                for Scan in range(1, NumberofScans + 1):
                    runsizes[RunId][Scan] = 0
                    size_dict[Scan] = 0
                if debug:
                    print(runsizes)
                    print(size_dict)
            elif line.startswith('0\t'):
                fields = line.split('\t')
                if debug:
                    print(fields)
                # Length is the 3rd entry
                Length = float(fields[2])
                # ScanNumber is the 8th entry
                ScanNumber = int(fields[7])
                # RunId is the second to last entry
                RunId = int(fields[-2])
                if debug:
                    # Single-argument print keeps the output identical on
                    # Python 2 and Python 3.
                    print('%s %s' % (ScanNumber, Length))
                runsizes[RunId][ScanNumber] += Length
                size_dict[ScanNumber] += Length
                if debug:
                    print(runsizes)
                    print(size_dict)
    finally:
        bnx_lines.close()
    return size_dict
def hist_plot(size_dict, values=None):
    """Plot the values of ``size_dict`` as a line and show the figure.

    Args:
        size_dict: Mapping whose values are plotted, ordered by sorted key
            so the x axis follows the keys rather than the dict's
            iteration order.
        values: Optional list to which the values are appended before
            plotting (mutated in place).  A fresh list is used when
            omitted, so repeated calls no longer pile up points from
            earlier calls in a shared default list.
    """
    if values is None:
        values = []
    values.extend(size_dict[n] for n in sorted(size_dict))
    plt.plot(values)
    plt.show()
if __name__ == '__main__':
    # Script entry point: parse the BNX file named on the command line and
    # dump the per-run, per-scan length totals as pretty-printed JSON.
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit('Usage: %s <file.bnx>' % sys.argv[0])
    parse_bnx(sys.argv[1])
    # Single-argument print(...) runs unchanged on Python 2 and Python 3.
    print(json.dumps(runsizes, sort_keys=True, indent=4, separators=(',', ': ')))
    # Run ids (1, 2, ..); not used further yet.
    labels = runsizes.keys()
"""
Example data :
# BNX File Version: 1.2
# Label Channels: 1
# Nickase Recognition Site 1:
# Min Molecule Length (Kb): 150
# Label SNR Filter Type: Static
# Min Label SNR: 2.750
# Software Version: 2.1.0.5973
#rh SourceFolder InstrumentSerial Time NanoChannelPixelsPerScan StretchFactor BasesPerPixel NumberofScans ChipId Flowcell LabelSNRFilterType MinMoleculeLength MinLabelSNR RunId
# Run data Z:\Labdata\AlphaUnit_07\2014-10\92544_tomato_nuclei_2014-10-02_10_24\Detect Molecules ALPHAUNIT07 10/2/2014 10:24:31 AM 68819821 0.85 490.646636962891 12 20249,11887,8/27/2014,850015130 1 Dynamic 100 4.481689 1
# Run Data Z:\Labdata\AlphaUnit_07\2014-10\92544_tomato_nuclei_2014-10-01_10_17\Detect Molecules ALPHAUNIT07 10/1/2014 10:17:14 AM 68819821 0.85 490.646636962891 3 20249,11887,8/27/2014,850015130 1 Dynamic 100 3.669297 2
# Run Data Z:\Labdata\AlphaUnit_07\2014-10\92544_tomato_nuclei_2014-10-02_17_40\Detect Molecules ALPHAUNIT07 10/2/2014 5:40:56 PM 68819821 0.85 490.646636962891 30 20249,11887,8/27/2014,850015130 1 Dynamic 100 4.0552 3
# Run Data Z:\Labdata\AlphaUnit_07\2014-10\92544_tomato_nuclei_2014-10-07_09_52\Detect Molecules ALPHAUNIT07 10/7/2014 9:52:50 AM 68819821 0.85 490.646636962891 30 20249,11887,8/27/2014,850015138 1 Dynamic 100 4.481689 4
# Run Data Z:\Labdata\AlphaUnit_07\2014-10\92544_tomato_nuclei_2014-10-07_15_57\Detect Molecules ALPHAUNIT07 10/7/2014 3:57:36 PM 68819821 0.85 490.646636962891 30 20249,11887,8/27/2014,850015138 1 Dynamic 100 4.481689 5
# Run Data Z:\Labdata\AlphaUnit_07\2014-10\92544_tomato_nuclei_2014-10-08_18_51\Detect Molecules ALPHAUNIT07 10/8/2014 6:51:47 PM 68819821 0.85 490.646636962891 30 20249,11897,9/19/2014,850015323 1 Dynamic 100 4.953032 6
# Run Data Z:\Labdata\AlphaUnit_04\2014-10\92544_tomato_nuclei_2014-10-09_16_33\Detect Molecules ALPHAUNIT04 10/9/2014 4:33:42 PM 68819821 0.85 490.646636962891 30 20249,11897,9/19/2014,850015349 1 Static 0 0 7
# Run Data foo\swap92571_tomato_nuclei_2014-10-23_17_27\bar B023 10/23/2014 5:27:39 PM 69207771 0.85 560.93212890625 30 20249,11887,8/27/2014,850015110 1 Static 0 0 8
# Run Data foo\swap92571_tomato_nuclei_2014-10-23_18_22\bar B023 10/23/2014 6:22:45 PM 69207771 0.85 560.93212890625 30 20249,11887,8/27/2014,850015110 2 Static 0 0 9
# Quality Score QX01: SNR
# Quality Score QX02: Ave Intensity
#0h LabelChannel MoleculeId Length AvgIntensity SNR NumberofLabels OriginalMoleculeId ScanNumber ScanDirection ChipId Flowcell RunId GlobalScanNumber
#0f int int float float float int int int int string int int int
#1h LabelChannel LabelPositions[N]
#1f int float
#2h LabelChannel LabelPositions[N]
#2h int float
#Qh QualityScoreID QualityScores[N]
#Qf str float
0 3 286801.4 0.177417 42.720 23 3 1 -1 20249,11887,8/27/2014,850015130 1 1 1
1 12681.6 34673.8 41179.9 44849.6 53791.5 67825.2 69610.3 77348.8 82893.8 88369.7 106001.2 112227.1 133172.5 141298.1 161661.7 169396.6 180201.0 212370.5 221516.8 245870.2 248775.1 276038.5 284017.0 286801.4
QX11 37.3425 33.6353 10.9667 29.1389 18.0582 30.1201 19.3122 13.6342 25.4127 27.4380 40.7450 22.8699 29.0980 11.4947 57.1931 8.3590 64.5878 23.0364 52.9536 18.7634 58.5257 22.7799 31.2551
QX12 0.0841 0.0816 0.0458 0.0708 0.0555 0.1042 0.0725 0.0519 0.0638 0.0674 0.1016 0.0647 0.0727 0.0408 0.1263 0.0393 0.1502 0.0704 0.1481 0.0764 0.1518 0.0577 0.0916
0 4 209794.0 0.117181 20.013 13 4 1 -1 20249,11887,8/27/2014,850015130 1 1 1
1 11480.0 13296.0 17692.7 91294.8 100287.0 102576.2 124496.7 142540.4 143437.5 173266.5 178092.0 201619.3 206410.0 209794.0
QX11 8.3894 32.0272 16.9435 70.4016 42.7193 4.9234 29.2025 16.6782 25.7460 16.2417 24.1204 42.1043 8.1092
QX12 0.0467 0.1062 0.0561 0.1684 0.1221 0.0425 0.0632 0.0575 0.0709 0.0515 0.0655 0.1128 0.0391
"""
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment