Cannot append Pandas dataframe to existing array · Issue #592 · TileDB-Inc/TileDB-Py · GitHub
[go: up one dir, main page]

Skip to content
Cannot append Pandas dataframe to existing array #592
@Hoeze

Description

@Hoeze

Hi, I'm trying to write an array like this:

# +
import json

import tiledb
import numpy as np
import pandas as pd
import random
# -

# Example input: eight records (row labels "0"-"5", "8", "9") decoded from an
# inline JSON document with the columns chrom, log10_len, start, end, ref, alt,
# sample_id, GT, GQ and DP.
test_df = pd.DataFrame.from_records(json.loads('{"chrom":{"0":"chr1","1":"chr1","2":"chr1","3":"chr1","4":"chr1","5":"chr1","8":"chr1","9":"chr1"},"log10_len":{"0":1,"1":1,"2":1,"3":1,"4":1,"5":1,"8":0,"9":0},"start":{"0":10108,"1":10108,"2":10108,"3":10108,"4":10108,"5":10108,"8":10143,"9":10143},"end":{"0":10114,"1":10114,"2":10114,"3":10114,"4":10114,"5":10114,"8":10144,"9":10144},"ref":{"0":"AACCCT","1":"AACCCT","2":"AACCCT","3":"AACCCT","4":"AACCCT","5":"AACCCT","8":"T","9":"T"},"alt":{"0":"A","1":"A","2":"A","3":"A","4":"A","5":"A","8":"C","9":"C"},"sample_id":{"0":"A","1":"B","2":"C","3":"D","4":"E","5":"F","8":"A","9":"B"},"GT":{"0":1,"1":1,"2":1,"3":1,"4":1,"5":1,"8":1,"9":1},"GQ":{"0":79,"1":39,"2":60,"3":99,"4":26,"5":62,"8":22,"9":65},"DP":{"0":12,"1":9,"2":39,"3":26,"4":9,"5":9,"8":35,"9":34}}'))
# Bare expression: presumably here so a notebook cell displays the frame.
test_df

# URI of the TileDB array that the rest of the script creates and writes to.
output_path="test.tdb"

# Context object passed to every TileDB constructor below.
ctx = tiledb.default_ctx()
# Bare expression: presumably here so a notebook cell displays the context.
ctx

# +
# Dimensions of the genotype array, in declaration order. The string
# dimensions (chrom, alt, sample_id) use dtype=np.bytes_ with an open
# (None, None) domain; the integer dimensions span 0..max of their dtype.
genotype_dims = [
    tiledb.Dim(
        name="chrom",
        domain=(None, None),
        tile=1,
        dtype=np.bytes_,
        ctx=ctx,
    ),
    tiledb.Dim(
        name="log10_len",
        domain=(0, np.iinfo(np.int8).max),
        tile=1,
        dtype=np.int8,
        ctx=ctx,
    ),
    tiledb.Dim(
        name="start",
        domain=(0, np.iinfo(np.int32).max),
        tile=100000,
        dtype=np.int32,
        ctx=ctx,
    ),
    tiledb.Dim(
        name="alt",
        domain=(None, None),
        tile=None,
        dtype=np.bytes_,
        ctx=ctx,
    ),
    # Disabled: "end" is stored as an attribute below instead of a dimension.
    # tiledb.Dim(name="end", domain=(1, np.iinfo(np.int32).max), dtype=np.int32, ctx=ctx),
    tiledb.Dim(
        name="sample_id",
        domain=(None, None),
        tile=None,
        dtype=np.bytes_,
        ctx=ctx,
    ),
]
genotype_domain = tiledb.Domain(*genotype_dims, ctx=ctx)

# Filter pipelines: Zstd alone for string data, byte-shuffle followed by Zstd
# for integer data.
string_filters = tiledb.FilterList([tiledb.ZstdFilter(level=-1)])
int_filters = tiledb.FilterList(
    [
        tiledb.ByteShuffleFilter(),
        tiledb.ZstdFilter(level=-1),
    ]
)

# Attributes stored per cell. Only GQ and DP are declared nullable.
attrs = [
    tiledb.Attr(
        name="end", dtype="int32", var=False, nullable=False, filters=int_filters
    ),
    tiledb.Attr(name="ref", dtype="S", nullable=False, filters=string_filters),
    tiledb.Attr(
        name="GT", dtype="int8", var=False, nullable=False, filters=int_filters
    ),
    tiledb.Attr(
        name="GQ", dtype="int32", var=False, nullable=True, filters=int_filters
    ),
    tiledb.Attr(
        name="DP", dtype="int32", var=False, nullable=True, filters=int_filters
    ),
]
# -

# Sparse array schema combining the genotype domain and the attribute list,
# with cells laid out in Hilbert order. (A `capacity=10000` setting was left
# disabled in the original.)
schema = tiledb.ArraySchema(
    ctx=ctx,
    sparse=True,
    cell_order="hilbert",
    domain=genotype_domain,
    attrs=attrs,
)
# Bare expression: presumably here so a notebook cell displays the schema.
schema

# Create the array from `schema` only when nothing exists yet at
# `output_path`; otherwise the existing array is reused as-is.
if not tiledb.array_exists(output_path):
    print("Creating array at '%s'..." % output_path)
    tiledb.array.SparseArray.create(output_path, schema, ctx=ctx)

# Append the example frame to the array. The frame built in this script is
# named `test_df`; no variable called `test` is defined here, so passing
# `test` would raise NameError before TileDB is ever reached.
tiledb.from_dataframe(output_path, test_df, sparse=True, mode="append")

However, the last line causes the following error:

---------------------------------------------------------------------------
TileDBError                               Traceback (most recent call last)
<ipython-input-84-d6af3de39a7d> in <module>
----> 1 tiledb.from_dataframe(output_path, test, sparse=True, mode="append")

/opt/anaconda/envs/tiledb/lib/python3.8/site-packages/tiledb/dataframe_.py in from_dataframe(uri, dataframe, **kwargs)
    485     )
    486 
--> 487     from_pandas(uri, dataframe, **kwargs)
    488 
    489 

/opt/anaconda/envs/tiledb/lib/python3.8/site-packages/tiledb/dataframe_.py in from_pandas(uri, dataframe, **kwargs)
    575                 dataframe, column_infos, tiledb_args.get("fillna")
    576             )
--> 577             _write_array(
    578                 uri,
    579                 dataframe,

/opt/anaconda/envs/tiledb/lib/python3.8/site-packages/tiledb/dataframe_.py in _write_array(uri, df, write_dict, nullmaps, create_array, row_start_idx, timestamp)
    649                     coords.append(df.index.get_level_values(k))
    650             # TODO ensure correct col/dim ordering
--> 651             libtiledb._setitem_impl_sparse(A, tuple(coords), write_dict, nullmaps)
    652 
    653         else:

tiledb/libtiledb.pyx in tiledb.libtiledb._setitem_impl_sparse()

tiledb/libtiledb.pyx in tiledb.libtiledb._write_array()

tiledb/libtiledb.pyx in tiledb.libtiledb._raise_ctx_err()

tiledb/libtiledb.pyx in tiledb.libtiledb._raise_tiledb_error()

TileDBError: [TileDB::Writer] Error: Cannot set buffer; Input attribute/dimension 'GQ' is nullable

Is there some mistake in my code?

PS: I had to set sparse=True in from_dataframe to be able to write, although the schema is already present.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions

      0