Convert Chemstation D Files Eu

Function to import Agilent GCMS Chemstation D files in R

GCMSAgilentDfileImport

##' Import an Agilent GC-MS ChemStation data file
##'
##' Function readDFile reads the binary file \code{DATA.MS} found in
##' \code{pathname}, walks the per-scan records it contains and returns the
##' ion counts as a scan-by-mass matrix.
##'
##' The reader assumes the record layout implied by the byte offsets used
##' below: the first scan record starts at byte 5771 (1-based); every record
##' consists of a 4-byte big-endian scan time in milliseconds, 12 further
##' header bytes whose last-but-four byte holds the number of peaks, one
##' 4-byte word per peak (2 bytes m/z * 20, 2 bytes packed intensity) and
##' 12 trailing bytes.
##'
##' @param pathname the pathname of the directory containing the data to import
##' @return outData Output is a matrix of ion counts with rows as scantime and
##' columns as mass, and the respective values as labels
##' @export
readDFile <- function(pathname) {
  # Layout constants (1-based byte positions / byte counts).
  firstRecordByte <- 5771L  # first byte of the first scan record
  countByteOffset <- 11L    # peak-count byte relative to the record start
  peakDataOffset <- 16L     # first peak word relative to the record start
  recordOverhead <- 28L     # 7 non-peak words (4 bytes each) per record

  filename <- file.path(pathname, 'DATA.MS')
  if (!file.exists(filename)) {
    stop('Cannot find ', filename, call. = FALSE)
  }

  cat('Opening Agilent file...\n')
  to.read <- file(filename, 'rb')
  # Close the connection even when reading fails.
  on.exit(close(to.read), add = TRUE)
  # Read the whole file as unsigned bytes (no fixed upper limit on the size).
  agilent <- readBin(to.read, integer(), size = 1, signed = FALSE,
                     n = file.size(filename), endian = 'little')
  fileLength <- length(agilent)
  if (fileLength < firstRecordByte + countByteOffset) {
    stop('File is too short to contain scan data: ', filename, call. = FALSE)
  }

  ### preparing vector with counts per scan
  cat('...extracting counts per scan...\n')
  # Each record's length depends on its own peak count, so the records have
  # to be walked one after the other. Every record is at least
  # recordOverhead bytes long, which bounds the number of records.
  # NOTE(review): only a single byte is read as the peak count, so scans
  # with more than 255 peaks cannot be represented - confirm against the
  # file format specification.
  maxRecords <- (fileLength - firstRecordByte + 1L) %/% recordOverhead + 1L
  counts <- integer(maxRecords)
  found <- 0L
  countPosition <- firstRecordByte + countByteOffset
  while (countPosition <= fileLength) {
    found <- found + 1L
    counts[found] <- agilent[countPosition]
    countPosition <- countPosition + counts[found] * 4L + recordOverhead
  }
  counts <- counts[seq_len(found)]

  # Start byte of every record found, plus the position where one further
  # record would start (used to detect the end of the scan data).
  recordStart <- firstRecordByte +
    c(0L, cumsum(counts * 4L + recordOverhead))

  ### The walk above may run past the last real scan into whatever follows
  ### the scan data. The second byte of the scan time changes by at most 1
  ### between consecutive scans; the first larger jump marks the end.
  cat('...determine number of scans...\n')
  timeSecondByte <- agilent[recordStart + 1L]
  jumps <- which(abs(diff(timeSecondByte)) > 1)
  if (length(jumps) > 0) {
    numberOfScans <- jumps[1]
  } else {
    # No jump found: keep every record that lies completely inside the file.
    numberOfScans <- sum(recordStart[-1] - 1L <= fileLength)
  }
  if (numberOfScans < 1) {
    stop('No complete scan found in ', filename, call. = FALSE)
  }
  counts <- counts[seq_len(numberOfScans)]
  recordStart <- recordStart[seq_len(numberOfScans)]

  ### locate the four bytes of every peak word
  cat('...separate rawdata in four sequences...\n')
  scanIndex <- rep(seq_len(numberOfScans), counts)
  # Peaks are taken in reverse file order within each scan: offsets run
  # from (count - 1) down to 0. Scans with zero peaks contribute nothing.
  reverseOffset <- rep(counts, counts) - sequence(counts)
  wordStart <- rep(recordStart + peakDataOffset, counts) + 4L * reverseOffset

  ## the SCAN TIME is the big-endian 32-bit word at the start of each record
  cat('...extracting scantimes...\n')
  scanTime <- agilent[recordStart] * 16777216 +
    agilent[recordStart + 1L] * 65536 +
    agilent[recordStart + 2L] * 256 +
    agilent[recordStart + 3L]
  # milliseconds -> minutes
  scanTime <- round(scanTime / 1000 / 60, 4)

  ### extract main data for INT and MZ
  cat('...extract intensity and Mz data...\n')
  mainFirst <- agilent[wordStart]
  mainSecond <- agilent[wordStart + 1L]
  mainThird <- agilent[wordStart + 2L]
  mainFourth <- agilent[wordStart + 3L]

  ### calculate MZs: (first * 256 + second) / 20, rounded to integer masses
  importMz <- round(mainFirst * 12.8 + mainSecond * 0.05)

  ### calculate intensity values
  cat('...calculate intensity values...\n')
  # The top two bits of the third byte are a base-8 exponent, the remaining
  # 14 bits (rest of third byte plus fourth byte) are the mantissa.
  exponent <- mainThird %/% 64L
  mantissa <- (mainThird %% 64L) * 256 + mainFourth
  importInt <- mantissa * 8^exponent

  cat('...assembling data matrix...\n')
  numberOfMasses <- max(importMz, 0)
  DATA <- matrix(0, numberOfScans, numberOfMasses)
  # Mass 0 has no column; skip such peaks instead of failing on them.
  keep <- !is.na(importMz) & importMz >= 1
  if (any(keep)) {
    DATA[cbind(scanIndex[keep], importMz[keep])] <- importInt[keep]
  }
  rownames(DATA) <- scanTime
  colnames(DATA) <- seq_len(ncol(DATA))

  #### Output is a matrix of ion counts with rows as scantime and columns as
  #### mass, and the respective values as labels
  DATA
}

# To use the function, setwd to the dir with the DATA.MS, then,
my_data <- readDFile(getwd())

# Total abundance per scan: sum the ion counts over all masses in each row
# of the imported matrix.
abundance <- rowSums(my_data)

# but all scan times are zeros!

# As this doesn't get the scan times for me, my workaround is to use Openchrom to open the Agilent DATA.MS files (with the plug-in), then 'save-as' CSV. The CSVs then need to be cleaned further before plots can be made. Or we can convert files batchwise to xy files (they are actually txt files, see attached).

# Step 1:

# Menu bar>File>Import > select 'Convert MSD chromatograms to *.ocb format'

# Step 2:

# Menu bar>File>Import > select 'Convert FID, ECD... chromatograms to *.xy or *.ocb format'

# then look for the xy file, which is a plain text, two column file

# read in CSV files from openchrom (assuming we have a batch)
# The dot is escaped so that only file names ending in a literal '.csv'
# are picked up from the working directory.
the_csv_files_from_openchrom <- list.files(pattern = '\\.csv$')

# read.csv2 reads ';'-separated files that use ',' as the decimal mark;
# the result is one data frame per file, in the same order as the file names.
the_chrom_data <- lapply(the_csv_files_from_openchrom, read.csv2)

# convert all cols to numeric

# Convert a factor (or any vector) to numeric via its character labels,
# so that factor levels are parsed as numbers rather than level codes.
asNumeric <- function(x) as.numeric(as.character(x))

# Return data frame `d` with every factor column replaced by its numeric
# value; all other columns are left untouched.
factorsNumeric <- function(d) {
  isFactor <- vapply(d, is.factor, logical(1))
  # List-style indexing keeps a data frame even when only one column is
  # selected, so the converted columns always carry their names.
  modifyList(d, lapply(d[isFactor], asNumeric))
}

the_chrom_data <- lapply(the_chrom_data, factorsNumeric)

# get only 'abundance' and 'time' for plotting
# NOTE(review): assumes the ion intensity columns start at column 4 of the
# exported CSV - confirm against the export format.
abundance_time <- lapply(the_chrom_data, function(i) {
  data.frame(
    time = i$RT.minutes....NOT.USED.BY.IMPORT,
    abundance = rowSums(i[, 4:ncol(i)])
  )
})

# add sample names to the data
# The sample name is the file name without its '.csv' extension.
names(abundance_time) <- sub('\\.csv$', '', the_csv_files_from_openchrom)

# number of rows contributed by each sample
indices <- lapply(abundance_time, nrow)

# repeat every sample name once per row of that sample
sample_ID <- rep(names(abundance_time), times = unlist(indices))

# stack all samples into one long data frame and label the rows
abundance_time <- do.call(rbind.data.frame, abundance_time)
abundance_time$sample_ID <- sample_ID

# make a grid of plots

library(ggplot2)

# One chromatogram panel per sample (abundance against time); every panel
# gets its own y-axis range.
ggplot(data = abundance_time, mapping = aes(x = time, y = abundance)) +
  geom_line() +
  labs(x = 'time (min)') +
  facet_wrap(facets = ~sample_ID, scales = 'free_y')

Hi, I have installed ProteoWizard but am struggling to convert files from Agilent's ChemStation to mzXML, even if I first export them to netCDF format in ChemStation. Please can you help me with this?

Mass spectrometry is a scientific technique for measuring the mass-to-charge ratio of ions. It is often coupled to chromatographic techniques such as gas- or liquid chromatography and has found widespread adoption in the fields of analytical chemistry and biochemistry where it can be used to identify and characterize small molecules and proteins (proteomics). The large volume of data produced in a typical mass spectrometry experiment requires that computers be used for data storage and processing. Over the years, different manufacturers of mass spectrometers have developed various proprietary data formats for handling such data which makes it difficult for academic scientists to directly manipulate their data. To address this limitation, several open, XML-based data formats have recently been developed by the Trans-Proteomic Pipeline at the Institute for Systems Biology to facilitate data manipulation and innovation in the public sector. These data formats are described here.

1 Open formats
2 Proprietary formats
3 Software

Open formats[edit]

JCAMP-DX[edit]

This format was one of the earliest attempts to supply a standardized file format for data exchange in mass spectrometry. JCAMP-DX was initially developed for infrared spectrometry. JCAMP-DX is an ASCII based format and therefore not very compact even though it includes standards for file compression. JCAMP was officially released in 1988.^[1] JCAMP was found impractical for today's large MS data sets, but it is still used for exchanging moderate numbers of spectra. IUPAC^[2] is currently in charge and the latest protocol is from 2005.^[3]

ANDI-MS or netCDF[edit]

The Analytical Data Interchange Format for Mass Spectrometry is a format for exchanging data. Many mass spectrometry software packages can read or write ANDI files. ANDI is specified in the ASTM E1947 Standard.^[4] ANDI is based on netCDF which is a software tool library for writing and reading data files. ANDI was initially developed for chromatography-MS data and therefore was not used in the proteomics gold rush where new formats based on XML were developed.

mzData[edit]

mzData was the first attempt by the Proteomics Standards Initiative (PSI) from the Human Proteome Organization (HUPO) to create a standardized format for Mass Spectrometry data.^[5] This format is now deprecated, and replaced by mzML.^[6]

mzXML[edit]

mzXML is an XML (eXtensible Markup Language) based common file format for proteomics mass spectrometric data.^[7]^[8] This format was developed at the Seattle Proteome Center/Institute for Systems Biology while the HUPO-PSI was trying to specify the standardized mzData format, and is still in use in the proteomics community.

mzML[edit]

Because having two formats (mzData and mzXML) that represent the same information is undesirable, a joint effort was set up by HUPO-PSI, the SPC/ISB and instrument vendors to create a unified standard borrowing the best aspects of both mzData and mzXML, and intended to replace them. Originally called dataXML, it was officially announced as mzML.^[9] The first specification was published in June 2008.^[10] This format was officially released at the 2008 American Society for Mass Spectrometry Meeting, and has since been relatively stable with very few updates. On 1 June 2009, mzML 1.1.0 was released. There are no planned further changes as of 2013.

Proprietary formats[edit]

Below is a table of different file format extensions.

Company	Extension	File type
Agilent Bruker	.D (folder)	Agilent MassHunter, Agilent ChemStation, or Bruker BAF/YEP/TDF data format
Agilent/Bruker	.YEP	instrument data format
Bruker	.BAF	instrument data format
Bruker	.FID	instrument data format
Bruker	.TDF	timsTOF instrument data format
ABI/Sciex	.WIFF	instrument data format
ABI/Sciex	.t2d	4700 and 4800 file format
Waters	.PKL	MassLynx peak list format
Thermo PerkinElmer	.RAW*	Thermo Xcalibur PerkinElmer TurboMass
Micromass**/Waters	.RAW* (folder)	Waters MassLynx
Chromtech Finnigan*** VG	.DAT	Finnigan ITDS file format; MAT95 instrument data format MassLab data format
Finnigan***	.MS	ITS40 instrument data format
Shimadzu	.QGD	GCMSSolution format
Shimadzu	.qgd	instrument data format
Shimadzu	.lcd	QQQ/QTOF instrument data format
Shimadzu	.spc	library data format
Bruker/Varian	.SMS	instrument data format
Bruker/Varian	.XMS	instrument data format
ION-TOF	.itm	raw measurement data
ION-TOF	.ita	analysis data
Physical Electronics/ULVAC-PHI	.raw*	raw measurement data
Physical Electronics/ULVAC-PHI	.tdc	spectrum data

(*) Note that the RAW formats of each vendor are not interchangeable; software from one cannot handle the RAW files from another.
(**) Micromass was acquired by Waters in 1997
(***) Finnigan is a division of Thermo

Software[edit]

Viewers[edit]

There are several viewers for mzXML, mzML and mzData: MZmine,^[11] PEAKS,^[12]Insilicos,^[13] MS-Spectre,^[14] TOPPView (mzXML, mzML and mzData),^[15] Spectra Viewer,^[16] SeeMS,^[17] msInspect,^[18] jmzML^[19] and Mascot Distiller.^[20]

There is a viewer for ITA images.^[21] ITA and ITM images can be parsed with the pySPM python library.^[22]

Converters[edit]

Known converters for mzData to mzXML:

Hermes: A Java 'mzData, mzXML, mzML' converter to all directions: publicly available, runs with a graphical user interface, by the Institute of Molecular Systems Biology, ETH Zurich^[23]^[24]

FileConverter: A command line tool that converts to/from various mass spectrometry formats,^[25] part of TOPP^[26]

Known converters for mzXML:

The Institute for Systems Biology maintains a list of converters^[27]

Known converters for mzML:

msConvert:^[28]^[29] A command line tool converting to/from various mass spectrometry formats. A GUI is also available for Windows users.

ReAdW:^[30] The Institute for Systems Biology command line converter for Thermo RAW files, part of the TransProteomicPipeline.^[31] The latest update of this tool was made in September 2009. Users are now redirected by the TPP development team to use the msConvert software (see above).

FileConverter: A command line tool that converts to/from various mass spectrometry formats,^[25] part of TOPP^[26]

Converters for proprietary formats:

msConvert:^[28]^[29] A command line tool converting to/from various mass spectrometry formats including multiple proprietary formats. A GUI is also available for Windows users.

CompassXport, Bruker's free tool generating mzXML (and now mzData)^{[citation needed]} files for many of their native file formats (.baf).

MASSTransit, a software to change data between proprietary formats, by Palisade Corporation and distributed by Scientific Instrument Services, Inc^[32] and PerkinElmer^[33]

Aston,^[34] native support for several Agilent Chemstation, Agilent Masshunter and Thermo Isodat file formats

unfinnigan,^[35] native support for Finnigan (*.RAW) file formats

OpenChrom, an open source software with support to convert various native file formats

Currently available converters are :

MassWolf, for Micromass MassLynx .Raw format

mzStar, for SCIEX/ABI Analyst format

wiff2dta^[36] for SCIEX/ABI Analyst format to mzXML, DTA, MGF and PMF

References[edit]

^R.S. McDonald and P.A. Wilks; 'JCAMP-DX: A Standard Form for Exchange of Infrared Spectra in Computer-Readable Form'; Applied Spectroscopy, Vol. 42, No. 1, January 1988, pp 151-162.
^IUPAC CPEP Subcommittee on Electronic Data Standards
^JCAMP-DX V.6.00 for CHROMATOGRAPHY and MASS SPECTROMETRY HYPHENATED METHODS (IUPAC Technical Note 2005); J. Hau, P. Lampen, R.J. Lancashire, R.S. McDonald, P.S. McIntyre, D.N. Rutledge, W. Schrader, A.N. Davies
^ASTM E1947 – 98(2009) Standard Specification for Analytical Data Interchange Protocol for Chromatographic Data
^Orchard S, Montechi-Palazzi L, Deutsch EW, Binz PA, Jones AR, Paton N, Pizarro A, Creasy DM, Wojcik J, Hermjakob H (2007). 'Five years of progress in the Standardization of Proteomics Data 4(th) Annual Spring Workshop of the HUPO-Proteomics Standards Initiative April 23–25, 2007 Ecole Nationale Supérieure (ENS), Lyon, France'. Proteomics. 7 (19): 3436–40. doi:10.1002/pmic.200700658. PMID17907277.
^'mzData'. HUPO-PSI. Retrieved 19 April 2013.
^Pedrioli PG, Eng JK, Hubley R, Vogelzang M, Deutsch EW, Raught B, Pratt B, Nilsson E, Angeletti RH, Apweiler R, Cheung K, Costello CE, Hermjakob H, Huang S, Julian RK, Kapp E, McComb ME, Oliver SG, Omenn G, Paton NW, Simpson R, Smith R, Taylor CF, Zhu W, Aebersold R (2004). 'A common open representation of mass spectrometry data and its application to proteomics research'. Nat. Biotechnol. 22 (11): 1459–66. doi:10.1038/nbt1031. PMID15529173.
^Lin SM, Zhu L, Winter AQ, Sasinowski M, Kibbe WA (2005). 'What is mzXML good for?'. Expert Review of Proteomics. 2 (6): 839–45. doi:10.1586/14789450.2.6.839. PMID16307524.
^'mzML'. HUPO-Proteomics Standards Initiative. Retrieved 19 April 2013.
^Deutsch EW (2008). 'mzML: A single, unifying data format for mass spectrometer output'. Proteomics. 8 (14): 2776–7. doi:10.1002/pmic.200890049. PMID18655045.
^'MZmine website'.
^'BSI: PEAKS website'. Bioinfor.com. Retrieved 29 November 2011.
^Insilicos website
^'MS-Spectre website'. Ms-spectre.sourceforge.net. Retrieved 29 November 2011.
^'OpenMS and TOPP website'. Open-ms.sourceforge.net. Retrieved 29 November 2011.
^'An open source viewer developed under academic projects'. Staff.icar.cnr.it. Retrieved 29 November 2011.
^'An open source viewer developed by Matt Chambers at Vanderbilt'. Proteowizard.sourceforge.net. Retrieved 29 November 2011.
^'An open source viewer developed at the Fred Hutchinson Cancer Center'. Proteomics.fhcrc.org. Retrieved 29 November 2011.
^'jmzML'. Retrieved 29 November 2011.
^Matrix Science Limited. 'Commercial software with free viewer mode for mzXML and many proprietary formats'. Matrixscience.com. Retrieved 29 November 2011.
^'ITAviewer online'.
'ITAviewer source'.
^'pySPM website'.
^HermesArchived 3 March 2016 at the Wayback Machine
^'Hermes website'. Icecoffee.ch. Retrieved 29 November 2011.
^ ^a^b'FileConverter'. Open-ms.sourceforge.net. Retrieved 29 November 2011.
^ ^a^bTOPPArchived 15 April 2008 at the Wayback Machine
^'mzXML'. Retrieved 30 June 2008.
^ ^a^b'msconvert'. ProteoWizard. Retrieved 20 April 2013.
^ ^a^b'ProteoWizard'. Retrieved 20 April 2013.
^'ReAdW'. Tools.proteomecenter.org. Retrieved 29 November 2011.
^'TransProteomicPipeline'. Tools.proteomecenter.org. 25 May 2011. Retrieved 29 November 2011.
^[1]Archived 9 May 2008 at the Wayback Machine
^'Gas Chromatography (GC)'. PerkinElmer. Retrieved 29 November 2011.
^aston - Open source chromatography and mass spectrometry software - Google Project Hosting
^unfinnigan - Painless extraction of mass spectra from Thermo 'raw' files - Google Project Hosting
^wiff2dta at sourceforge

Retrieved from 'https://en.wikipedia.org/w/index.php?title=Mass_spectrometry_data_format&oldid=935946809'