-
Notifications
You must be signed in to change notification settings - Fork 36
Open
Description
Hi, I'm trying to write an array like this:
# +
import json
import tiledb
import numpy as np
import pandas as pd
import random
# -
# Column-oriented JSON payload: each top-level key is a column name mapping
# string row labels ("0".."5", "8", "9") to that column's values.
records_json = '{"chrom":{"0":"chr1","1":"chr1","2":"chr1","3":"chr1","4":"chr1","5":"chr1","8":"chr1","9":"chr1"},"log10_len":{"0":1,"1":1,"2":1,"3":1,"4":1,"5":1,"8":0,"9":0},"start":{"0":10108,"1":10108,"2":10108,"3":10108,"4":10108,"5":10108,"8":10143,"9":10143},"end":{"0":10114,"1":10114,"2":10114,"3":10114,"4":10114,"5":10114,"8":10144,"9":10144},"ref":{"0":"AACCCT","1":"AACCCT","2":"AACCCT","3":"AACCCT","4":"AACCCT","5":"AACCCT","8":"T","9":"T"},"alt":{"0":"A","1":"A","2":"A","3":"A","4":"A","5":"A","8":"C","9":"C"},"sample_id":{"0":"A","1":"B","2":"C","3":"D","4":"E","5":"F","8":"A","9":"B"},"GT":{"0":1,"1":1,"2":1,"3":1,"4":1,"5":1,"8":1,"9":1},"GQ":{"0":79,"1":39,"2":60,"3":99,"4":26,"5":62,"8":22,"9":65},"DP":{"0":12,"1":9,"2":39,"3":26,"4":9,"5":9,"8":35,"9":34}}'

# Decode the payload and build the test frame from the resulting dict.
test_df = pd.DataFrame.from_records(json.loads(records_json))

# Bare expression, presumably kept so a notebook cell displays the frame.
test_df
# URI of the TileDB array that the later cells create and write to.
output_path = "test.tdb"

# Context object passed explicitly to the dimension, domain and schema
# constructors and to the array-creation call further down.
ctx = tiledb.default_ctx()

# Bare expression, presumably kept so a notebook cell displays the context.
ctx
# +
# Upper bounds of the integer dimensions: the largest value of each dtype.
int8_max = np.iinfo(np.int8).max
int32_max = np.iinfo(np.int32).max

# Dimensions in schema order.  The three bytes-typed dimensions are declared
# with domain=(None, None).  `end` is not a dimension here (its Dim line is
# kept commented out); it is declared as an attribute below instead.
genotype_dims = (
    tiledb.Dim(name="chrom", domain=(None, None), tile=1, dtype=np.bytes_, ctx=ctx),
    tiledb.Dim(name="log10_len", domain=(0, int8_max), tile=1, dtype=np.int8, ctx=ctx),
    tiledb.Dim(name="start", domain=(0, int32_max), tile=100000, dtype=np.int32, ctx=ctx),
    tiledb.Dim(name="alt", domain=(None, None), tile=None, dtype=np.bytes_, ctx=ctx),
    # tiledb.Dim(name="end", domain=(1, np.iinfo(np.int32).max), dtype=np.int32, ctx=ctx),
    tiledb.Dim(name="sample_id", domain=(None, None), tile=None, dtype=np.bytes_, ctx=ctx),
)
genotype_domain = tiledb.Domain(*genotype_dims, ctx=ctx)

# Filter pipelines: string data gets a single ZstdFilter(level=-1); integer
# data gets a ByteShuffleFilter placed ahead of the same Zstd filter.
string_filters = tiledb.FilterList([tiledb.ZstdFilter(level=-1)])
int_filters = tiledb.FilterList(
    [tiledb.ByteShuffleFilter(), tiledb.ZstdFilter(level=-1)]
)

# Attributes stored per cell.  Only GQ and DP are declared nullable.
attrs = [
    tiledb.Attr(name="end", dtype="int32", var=False, nullable=False, filters=int_filters),
    tiledb.Attr(name="ref", dtype="S", nullable=False, filters=string_filters),
    tiledb.Attr(name="GT", dtype="int8", var=False, nullable=False, filters=int_filters),
    tiledb.Attr(name="GQ", dtype="int32", var=False, nullable=True, filters=int_filters),
    tiledb.Attr(name="DP", dtype="int32", var=False, nullable=True, filters=int_filters),
]
# -
# Sparse array schema combining the genotype domain with the attribute list;
# cell_order is set to "hilbert".  The capacity override stays disabled.
schema = tiledb.ArraySchema(
    ctx=ctx,
    domain=genotype_domain,
    attrs=attrs,
    sparse=True,
    cell_order="hilbert",
    # capacity=10000,
)

# Bare expression, presumably kept so a notebook cell displays the schema.
schema
# Create the array from `schema` only when nothing exists at `output_path`
# yet, so re-running the cell does not try to create it a second time.
if not tiledb.array_exists(output_path):
    print(f"Creating array at '{output_path}'...")
    tiledb.array.SparseArray.create(output_path, schema, ctx=ctx)

# Append the test frame to the array.  This runs whether or not the array was
# just created (the traceback shows the call executed as its own cell).
# NOTE(review): the pasted snippet passed `test`, a name it never defines;
# `test_df` is the only frame defined here.  Presumably `test` was derived
# from `test_df` (possibly with the dimension columns set as its index) --
# confirm against the original notebook.
tiledb.from_dataframe(output_path, test_df, sparse=True, mode="append")
However, the last line causes the following error:
---------------------------------------------------------------------------
TileDBError Traceback (most recent call last)
<ipython-input-84-d6af3de39a7d> in <module>
----> 1 tiledb.from_dataframe(output_path, test, sparse=True, mode="append")
/opt/anaconda/envs/tiledb/lib/python3.8/site-packages/tiledb/dataframe_.py in from_dataframe(uri, dataframe, **kwargs)
485 )
486
--> 487 from_pandas(uri, dataframe, **kwargs)
488
489
/opt/anaconda/envs/tiledb/lib/python3.8/site-packages/tiledb/dataframe_.py in from_pandas(uri, dataframe, **kwargs)
575 dataframe, column_infos, tiledb_args.get("fillna")
576 )
--> 577 _write_array(
578 uri,
579 dataframe,
/opt/anaconda/envs/tiledb/lib/python3.8/site-packages/tiledb/dataframe_.py in _write_array(uri, df, write_dict, nullmaps, create_array, row_start_idx, timestamp)
649 coords.append(df.index.get_level_values(k))
650 # TODO ensure correct col/dim ordering
--> 651 libtiledb._setitem_impl_sparse(A, tuple(coords), write_dict, nullmaps)
652
653 else:
tiledb/libtiledb.pyx in tiledb.libtiledb._setitem_impl_sparse()
tiledb/libtiledb.pyx in tiledb.libtiledb._write_array()
tiledb/libtiledb.pyx in tiledb.libtiledb._raise_ctx_err()
tiledb/libtiledb.pyx in tiledb.libtiledb._raise_tiledb_error()
TileDBError: [TileDB::Writer] Error: Cannot set buffer; Input attribute/dimension 'GQ' is nullable
Is there some mistake in my code?
PS: I had to set `sparse=True` in `from_dataframe` to be able to write, even though the schema is already present.
Metadata
Metadata
Assignees
Labels
No labels