Hello Brian!
Thanks for the reply!
I checked the schema and added it, unfortunately it does not seem to solve the issue.
I am attaching sampleblib.blib
which is the one I am generating. [file was too large, link here https://github.com/jspaezp/burner_repo/releases/download/v0.0.1/sampleblib.blib]
I am also attaching sky.blib
which was generated following the diaPASEF tutorial with the 'small data' and then modified using the script below. Oddly enough, skyline does identify the IMS data in this file correctly after the values have been over-written by my script.
This is the script I am using to add the mobility info (pardon the messy code). Let me know if you need the 'weights files' as well!
import sqlite3
import pandas as pd
import argparse
import lightgbm as lgbm
pd.set_option("display.max_columns", None)
REF_SPECTRA_SCHEMA = """
CREATE TABLE RefSpectra ( -- spectrum metadata - actual mz/intensity pairs in RefSpectraPeaks
id INTEGER primary key autoincrement not null, -- lookup key for RefSpectraPeaks
peptideSeq VARCHAR(150), -- unmodified peptide sequence, can be left blank for small molecule use
precursorMZ REAL, -- mz of the precursor that produced this spectrum
precursorCharge INTEGER, -- should agree with adduct if provided
peptideModSeq VARCHAR(200), -- modified peptide sequence, can be left blank for small molecule use
prevAA CHAR(1), -- position of peptide in its parent protein (can be left blank)
nextAA CHAR(1), -- position of peptide in its parent protein (can be left blank)
copies INTEGER, -- number of copies this spectrum was chosen from if it is in a filtered library
numPeaks INTEGER, -- number of peaks, should agree with corresponding entry in RefSpectraPeaks
ionMobility REAL, -- ion mobility value, if known (see ionMobilityType for units)
collisionalCrossSectionSqA REAL, -- precursor CCS in square Angstroms for ion mobility, if known
ionMobilityHighEnergyOffset REAL, -- ion mobility value increment for fragments (see ionMobilityType for units)
ionMobilityType TINYINT, -- ion mobility units (required if ionMobility is used, see IonMobilityTypes table for key)
retentionTime REAL, -- chromatographic retention time in minutes, if known
startTime REAL, -- start retention time in minutes, if known
endTime REAL, -- end retention time in minutes, if known
totalIonCurrent REAL, -- total ion current of spectrum
moleculeName VARCHAR(128), -- precursor molecule's name (not needed for peptides)
chemicalFormula VARCHAR(128), -- precursor molecule's neutral formula (not needed for peptides)
precursorAdduct VARCHAR(128), -- ionizing adduct e.g. [M+Na], [2M-H2O+2H] etc (not needed for peptides)
inchiKey VARCHAR(128), -- molecular identifier for structure retrieval (not needed for peptides)
otherKeys VARCHAR(128), -- alternative molecular identifiers for structure retrieval, tab separated name:value pairs e.g. cas:58-08-2\thmdb:01847 (not needed for peptides)
fileID INTEGER, -- index into SpectrumSourceFiles table for source file information
SpecIDinFile VARCHAR(256), -- original spectrum label, id, or description in source file
score REAL, -- spectrum score, typically a probability score (see scoreType)
scoreType TINYINT -- spectrum score type, see ScoreTypes table for meaning
);
CREATE INDEX idxPeptide ON RefSpectra (peptideSeq, precursorCharge);
CREATE INDEX idxPeptideMod ON RefSpectra (peptideModSeq, precursorCharge);
CREATE INDEX idxMoleculeName ON RefSpectra (moleculeName, precursorAdduct);
CREATE INDEX idxInChiKey ON RefSpectra (inchiKey, precursorAdduct);
"""
RT_SCHEMA = """
CREATE TABLE IF NOT EXISTS "RetentionTimes" (
"RefSpectraID" INTEGER,
"RedundantRefSpectraID" INTEGER,
"SpectrumSourceID" INTEGER,
"ionMobility" REAL,
"collisionalCrossSectionSqA" REAL,
"ionMobilityHighEnergyOffset" REAL,
"ionMobilityType" INTEGER,
"retentionTime" REAL,
"startTime" TEXT,
"endTime" TEXT,
"score" REAL,
"bestSpectrum" INTEGER
);
"""
def main(blib, one_over_k0_model_file, ccs_model_file):
conn = sqlite3.connect(blib)
cur = conn.cursor()
print("Reading from file")
df = pd.read_sql_query("SELECT * FROM RefSpectra", conn)
print(df.head())
pred_df = df[["id", "peptideSeq", "precursorMZ", "precursorCharge"]].copy()
pred_df["PepLength"] = pred_df["peptideSeq"].str.len()
pred_df["NumBulky"] = pred_df["peptideSeq"].str.count("[LVIFWY]")
pred_df["NumPos"] = pred_df["peptideSeq"].str.count("[KRH]")
pred_df["NumNeg"] = pred_df["peptideSeq"].str.count("[DE]")
print("Predicting ion mobility")
######### Prediction Start
one_over_k0_model = lgbm.Booster(model_file=one_over_k0_model_file)
ccs_model = lgbm.Booster(model_file=ccs_model_file)
ook0_predicted = one_over_k0_model.predict(
pred_df[
[
"precursorMZ",
"precursorCharge",
"PepLength",
"NumBulky",
"NumPos",
"NumNeg",
]
]
)
ccs_predicted = ccs_model.predict(
pred_df[
[
"precursorMZ",
"precursorCharge",
"PepLength",
"NumBulky",
"NumPos",
"NumNeg",
]
]
)
id_to_imns = dict(zip(pred_df["id"], ook0_predicted))
id_to_ccs = dict(zip(pred_df["id"], ccs_predicted))
####### Prediction End
try:
del df["ionMobilityValue"]
except KeyError:
pass
df["collisionalCrossSectionSqA"] = ccs_predicted
df["ionMobility"] = ook0_predicted
df["ionMobilityHighEnergyOffset"] = 0
df["ionMobilityHighEnergyOffset"] = df["ionMobilityHighEnergyOffset"].astype("float64")
df["ionMobilityType"] = 2
print("Writing to file")
print("Updating RefSpectra Table")
cur.execute("DROP TABLE RefSpectra")
cur.executescript(REF_SPECTRA_SCHEMA)
df.to_sql("RefSpectra", conn, if_exists="append", index=False, schema=REF_SPECTRA_SCHEMA)
print("Updating RetentionTimes Table")
rtdf = pd.read_sql_query("SELECT * FROM RetentionTimes", conn)
try:
del rtdf["ionMobilityValue"]
except KeyError:
pass
rtdf["ionMobility"] = [id_to_imns[x] for x in rtdf["RefSpectraID"]]
rtdf["collisionalCrossSectionSqA"] = [id_to_ccs[x] for x in rtdf["RefSpectraID"]]
rtdf["ionMobilityType"] = 2
rtdf["ionMobilityHighEnergyOffset"] = 0
rtdf["ionMobilityHighEnergyOffset"] = rtdf["ionMobilityHighEnergyOffset"].astype("float64")
cur.execute("DROP TABLE RetentionTimes")
cur.executescript(RT_SCHEMA)
rtdf.to_sql("RetentionTimes", conn, if_exists="replace", index=False, schema=RT_SCHEMA)
# Just a print for sanity checking
print("Printing header of the new table")
df = pd.read_sql_query("SELECT * FROM RefSpectra LIMIT 5", conn)
print(df.head())
df = pd.read_sql_query("SELECT * FROM RetentionTimes LIMIT 5", conn)
print(df.head())
print("Creating ims info table")
# Encyclopedia does not write this table, so we add it here
try:
cur.executescript(
"""
CREATE TABLE IonMobilityTypes (
id INTEGER PRIMARY KEY,
ionMobilityType VARCHAR(128)
);
INSERT INTO IonMobilityTypes(id, ionMobilityType) VALUES(0, 'none');
INSERT INTO IonMobilityTypes(id, ionMobilityType) VALUES(1, 'driftTime(msec)');
INSERT INTO IonMobilityTypes(id, ionMobilityType) VALUES(2, 'inverseK0(Vsec/cm^2)');
INSERT INTO IonMobilityTypes(id, ionMobilityType) VALUES(3, 'compensation(V)');
"""
)
except sqlite3.OperationalError as e:
if "table IonMobilityTypes already exists" in str(e):
pass
else:
raise
conn.commit()
conn.close()
print("Done!")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Add IMS to BLIB")
parser.add_argument("blib", help="BLIB file")
parser.add_argument(
"one_over_k0_model_file",
help="Weights file with the IMS model to use for the lgbm model that predicts 1/k0",
)
parser.add_argument(
"ccs_model_file",
help="Weights file with the IMS model to use for the lgbm model that predicts CCS",
)
args, unknown = parser.parse_known_args()
if unknown:
raise RuntimeError("Unrecognized arguments: ", unknown)
else:
main(
args.blib,
one_over_k0_model_file=args.one_over_k0_model_file,
ccs_model_file=args.ccs_model_file,
)
On the other hand, is there any place where the .imsdb
schema is documented? Opening one manually I had questions on several of the fields.
Thank you very much for the help and time!
Sebastian