This is the output from running TGI, obtained after sending a request to its URL endpoint (a sketch of such a request follows).
Is this a vLLM version mismatch?
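The `chat_completions:generate:generate_stream` span in the error log below indicates the request went through TGI's chat-completions route. For reproducibility, here is a minimal sketch of the kind of request that triggers the failure, assuming TGI's OpenAI-compatible Messages API; the host, port, model name, and image URL are placeholders, not taken from the original report.

```python
# Hypothetical reproduction of the URL request; the endpoint path follows
# TGI's OpenAI-compatible Messages API, everything else is a placeholder.
import requests

resp = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "model": "tgi",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "image_url",
                     "image_url": {"url": "https://example.com/cat.png"}},
                    {"type": "text", "text": "Describe this image."},
                ],
            }
        ],
        "max_tokens": 64,
        "stream": False,
    },
    timeout=60,
)
print(resp.status_code, resp.text)
```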
After adding v_scale 1.0, I get the following error:
2024-07-29T09:36:56.399093Z INFO text_generation_router: router/src/main.rs:369: Connected
2024-07-29T09:36:59.652787Z ERROR text_generation_launcher: Method Decode encountered an error.
Traceback (most recent call last):
File "/opt/conda/bin/text-generation-server", line 8, in
sys.exit(app())
File "/opt/conda/lib/python3.10/site-packages/typer/main.py", line 311, in __call__
return get_command(self)(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1157, in __call__
return self.main(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/typer/core.py", line 778, in main
return _main(
File "/opt/conda/lib/python3.10/site-packages/typer/core.py", line 216, in _main
rv = self.invoke(ctx)
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1688, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1434, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 783, in invoke
return __callback(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/typer/main.py", line 683, in wrapper
return callback(**use_params) # type: ignore
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/cli.py", line 90, in serve
server.serve(
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/server.py", line 264, in serve
asyncio.run(
File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
return loop.run_until_complete(main)
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 636, in run_until_complete
self.run_forever()
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
self._run_once()
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
handle._run()
File "/opt/conda/lib/python3.10/asyncio/events.py", line 80, in _run
self._context.run(self._callback, *self._args)
File "/opt/conda/lib/python3.10/site-packages/grpc_interceptor/server.py", line 165, in invoke_intercept_method
return await self.intercept(
> File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/interceptor.py", line 21, in intercept
return await response
File "/opt/conda/lib/python3.10/site-packages/opentelemetry/instrumentation/grpc/_aio_server.py", line 82, in _unary_interceptor
raise error
File "/opt/conda/lib/python3.10/site-packages/opentelemetry/instrumentation/grpc/_aio_server.py", line 73, in _unary_interceptor
return await behavior(request_or_iterator, context)
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/server.py", line 183, in Decode
generations, next_batch, timings = self.model.generate_token(batch)
File "/opt/conda/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/models/flash_causal_lm.py", line 1035, in generate_token
raise e
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/models/flash_causal_lm.py", line 1032, in generate_token
out, speculative_logits = self.forward(batch)
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/models/vlm_causal_lm.py", line 333, in forward
logits, speculative_logits = self.model.forward(
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/models/custom_modeling/cogvlm2.py", line 115, in forward
hidden_states = self.language_model.model(
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/models/custom_modeling/flash_cogvlm2_modeling.py", line 400, in forward
hidden_states, residual = layer(
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/models/custom_modeling/flash_cogvlm2_modeling.py", line 319, in forward
attn_output = self.self_attn(
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/models/custom_modeling/flash_cogvlm2_modeling.py", line 183, in forward
paged_attention.attention(
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/utils/paged_attention.py", line 122, in attention
ops.paged_attention_v2(
File "/home/nfs/zhangzz/code/TGI/vllm/vllm/_custom_ops.py", line 34, in wrapper
return fn(*args, **kwargs)
File "/home/nfs/zhangzz/code/TGI/vllm/vllm/_custom_ops.py", line 127, in paged_attention_v2
torch.ops._C.paged_attention_v2(
File "/opt/conda/lib/python3.10/site-packages/torch/_ops.py", line 854, in __call__
return self_._op(*args, **(kwargs or {}))
RuntimeError: _C::paged_attention_v2() Expected a value of type 'int' for argument 'num_kv_heads' but instead found type 'Tensor'.
Position: 7
Value: tensor([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
6, 6, 6, 6, 7, 7, 7, 7], device='cuda:0', dtype=torch.int32)
Declaration: _C::paged_attention_v2(Tensor($0! -> ) out, Tensor exp_sums, Tensor max_logits, Tensor tmp_out, Tensor query, Tensor key_cache, Tensor value_cache, int num_kv_heads, float scale, Tensor block_tables, Tensor seq_lens, int block_size, int max_seq_len, Tensor? alibi_slopes, str kv_cache_dtype, float k_scale, float v_scale, int tp_rank, int blocksparse_local_blocks, int blocksparse_vert_stride, int blocksparse_block_size, int blocksparse_head_sliding_step) -> ()
Cast error details: Unable to cast Python instance of type <class 'torch.Tensor'> to C++ type '?' (#define PYBIND11_DETAILED_ERROR_MESSAGES or compile in debug mode for details)
2024-07-29T09:36:59.962808Z ERROR batch{batch_size=1}:decode:decode{size=1}:decode{size=1}: text_generation_client: router/client/src/lib.rs:33: Server error: CANCELLED
2024-07-29T09:37:00.906210Z ERROR batch{batch_size=1}:decode:clear_cache{batch_id=Some(0)}:clear_cache{batch_id=Some(0)}: text_generation_client: router/client/src/lib.rs:33: Server error: transport error
2024-07-29T09:37:00.906280Z ERROR chat_completions:generate:generate_stream:infer:send_error: text_generation_router::infer: router/src/infer.rs:871: Request failed during generation: Server error: CANCELLED
2024-07-29T09:37:00.992939Z ERROR shard-manager: text_generation_launcher: Shard complete standard error output:
The argument trust_remote_code is to be used with Auto classes. It has no effect here and is ignored.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are using a model of type cogvlm2 to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
/opt/conda/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:658: UserWarning: You are using a Backend <class 'text_generation_server.utils.dist.FakeGroup'> as a ProcessGroup. This usage is deprecated since PyTorch 2.0. Please use a public API of PyTorch Distributed instead.
warnings.warn(
Exception ignored in: <function Server.__del__ at 0x7f0e0decbeb0>
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/site-packages/grpc/aio/_server.py", line 194, in del
cygrpc.schedule_coro_threadsafe(
File "src/python/grpcio/grpc/_cython/_cygrpc/aio/common.pyx.pxi", line 120, in grpc._cython.cygrpc.schedule_coro_threadsafe
File "src/python/grpcio/grpc/_cython/_cygrpc/aio/common.pyx.pxi", line 112, in grpc._cython.cygrpc.schedule_coro_threadsafe
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 436, in create_task
self._check_closed()
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 515, in _check_closed
raise RuntimeError('Event loop is closed')
RuntimeError: Event loop is closed
sys:1: RuntimeWarning: coroutine 'AioServer.shutdown' was never awaited
Task exception was never retrieved
future: <Task finished name='HandleExceptions[/generate.v2.TextGenerationService/Decode]' coro=<()> exception=SystemExit(1)>
Traceback (most recent call last):
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/interceptor.py", line 21, in intercept
return await response
File "/opt/conda/lib/python3.10/site-packages/opentelemetry/instrumentation/grpc/_aio_server.py", line 82, in _unary_interceptor
raise error
File "/opt/conda/lib/python3.10/site-packages/opentelemetry/instrumentation/grpc/_aio_server.py", line 73, in _unary_interceptor
return await behavior(request_or_iterator, context)
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/server.py", line 183, in Decode
generations, next_batch, timings = self.model.generate_token(batch)
File "/opt/conda/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/models/flash_causal_lm.py", line 1035, in generate_token
raise e
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/models/flash_causal_lm.py", line 1032, in generate_token
out, speculative_logits = self.forward(batch)
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/models/vlm_causal_lm.py", line 333, in forward
logits, speculative_logits = self.model.forward(
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/models/custom_modeling/cogvlm2.py", line 115, in forward
hidden_states = self.language_model.model(
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/models/custom_modeling/flash_cogvlm2_modeling.py", line 400, in forward
hidden_states, residual = layer(
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/models/custom_modeling/flash_cogvlm2_modeling.py", line 319, in forward
attn_output = self.self_attn(
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/models/custom_modeling/flash_cogvlm2_modeling.py", line 183, in forward
paged_attention.attention(
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/utils/paged_attention.py", line 122, in attention
ops.paged_attention_v2(
File "/home/nfs/zhangzz/code/TGI/vllm/vllm/_custom_ops.py", line 34, in wrapper
return fn(*args, **kwargs)
File "/home/nfs/zhangzz/code/TGI/vllm/vllm/_custom_ops.py", line 127, in paged_attention_v2
torch.ops._C.paged_attention_v2(
File "/opt/conda/lib/python3.10/site-packages/torch/ops.py", line 854, in call
return self._op(*args, **(kwargs or {}))
RuntimeError: _C::paged_attention_v2() Expected a value of type 'int' for argument 'num_kv_heads' but instead found type 'Tensor'.
Position: 7
Value: tensor([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
6, 6, 6, 6, 7, 7, 7, 7], device='cuda:0', dtype=torch.int32)
Declaration: _C::paged_attention_v2(Tensor($0! -> ) out, Tensor exp_sums, Tensor max_logits, Tensor tmp_out, Tensor query, Tensor key_cache, Tensor value_cache, int num_kv_heads, float scale, Tensor block_tables, Tensor seq_lens, int block_size, int max_seq_len, Tensor? alibi_slopes, str kv_cache_dtype, float k_scale, float v_scale, int tp_rank, int blocksparse_local_blocks, int blocksparse_vert_stride, int blocksparse_block_size, int blocksparse_head_sliding_step) -> ()
Cast error details: Unable to cast Python instance of type <class 'torch.Tensor'> to C++ type '?' (#define PYBIND11_DETAILED_ERROR_MESSAGES or compile in debug mode for details)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/site-packages/typer/main.py", line 311, in call
return get_command(self)(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1157, in call
return self.main(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/typer/core.py", line 778, in main
return _main(
File "/opt/conda/lib/python3.10/site-packages/typer/core.py", line 216, in _main
rv = self.invoke(ctx)
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1688, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 1434, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/opt/conda/lib/python3.10/site-packages/click/core.py", line 783, in invoke
return __callback(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/typer/main.py", line 683, in wrapper
return callback(**use_params) # type: ignore
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/cli.py", line 90, in serve
server.serve(
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/server.py", line 264, in serve
asyncio.run(
File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
return loop.run_until_complete(main)
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 636, in run_until_complete
self.run_forever()
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
self._run_once()
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
handle._run()
File "/opt/conda/lib/python3.10/asyncio/events.py", line 80, in _run
self._context.run(self._callback, *self._args)
File "src/python/grpcio/grpc/_cython/_cygrpc/aio/server.pyx.pxi", line 702, in _handle_exceptions
File "src/python/grpcio/grpc/_cython/_cygrpc/aio/server.pyx.pxi", line 689, in grpc._cython.cygrpc._handle_exceptions
File "src/python/grpcio/grpc/_cython/_cygrpc/aio/server.pyx.pxi", line 821, in _handle_rpc
File "src/python/grpcio/grpc/_cython/_cygrpc/aio/server.pyx.pxi", line 554, in _handle_unary_unary_rpc
File "src/python/grpcio/grpc/_cython/_cygrpc/aio/server.pyx.pxi", line 408, in _finish_handler_with_unary_response
File "/opt/conda/lib/python3.10/site-packages/grpc_interceptor/server.py", line 165, in invoke_intercept_method
return await self.intercept(
File "/home/nfs/zhangzz/code/TGI/cogvlm2/server/text_generation_server/interceptor.py", line 28, in intercept
exit(1)
File "/opt/conda/lib/python3.10/_sitebuiltins.py", line 26, in call
raise SystemExit(code)
SystemExit: 1 rank=0
2024-07-29T09:37:01.055481Z ERROR text_generation_launcher: Shard 0 crashed
2024-07-29T09:37:01.055512Z INFO text_generation_launcher: Terminating webserver
2024-07-29T09:37:01.055536Z INFO text_generation_launcher: Waiting for webserver to gracefully shutdown
2024-07-29T09:37:01.055705Z INFO text_generation_router::server: router/src/server.rs:1754: signal received, starting graceful shutdown
2024-07-29T09:37:01.155713Z INFO text_generation_launcher: webserver terminated
2024-07-29T09:37:01.155751Z INFO text_generation_launcher: Shutting down shards
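This does look like a vLLM API mismatch rather than a model bug. The declaration printed in the error expects `num_kv_heads` as a plain `int` at position 7, but the call in `utils/paged_attention.py` is still passing the per-head mapping tensor (`[0, 0, 0, 0, 1, 1, 1, 1, ...]`) that older vLLM signatures took as `head_mapping`; adding `v_scale` alone cannot fix that, because the tensor still lands in the `num_kv_heads` slot. Below is a minimal sketch of a call matching the printed declaration; apart from `ops.paged_attention_v2` itself, the variable names (`kv_head_mapping`, `input_lengths`, `max_s`, etc.) are assumptions about the local TGI code, not taken from this log.

```python
# Hypothetical patch inside server/text_generation_server/utils/paged_attention.py,
# aligned with the _C::paged_attention_v2 declaration printed in the error.
# Recover the integer KV-head count from the old mapping tensor
# (values 0..7 repeated here, so 8 KV heads), then pass arguments positionally.
num_kv_heads = int(kv_head_mapping.max().item()) + 1

ops.paged_attention_v2(
    out,            # Tensor out
    exp_sums,       # Tensor exp_sums
    max_logits,     # Tensor max_logits
    tmp_output,     # Tensor tmp_out
    query,          # Tensor query
    key_cache,      # Tensor key_cache
    value_cache,    # Tensor value_cache
    num_kv_heads,   # int, NOT the mapping tensor that raised the RuntimeError
    softmax_scale,  # float scale
    block_tables,   # Tensor block_tables
    input_lengths,  # Tensor seq_lens
    block_size,     # int block_size
    max_s,          # int max_seq_len
    None,           # Tensor? alibi_slopes
    "auto",         # str kv_cache_dtype
    1.0,            # float k_scale
    1.0,            # float v_scale
    0,              # int tp_rank
    0,              # int blocksparse_local_blocks
    0,              # int blocksparse_vert_stride
    0,              # int blocksparse_block_size
    0,              # int blocksparse_head_sliding_step
)
```

The alternative is to pin vLLM to the version the patched TGI fork was written against, so the op signature matches the existing call.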