0

Our nginx server core dumps with a relatively low probability after we modified some code; it crashes while calling another function. I'm not sure what the direct cause is. For example, nginx may have tried to read or write an invalid memory address, but what is the exact faulting address, and how can I find it? Here are the innermost frames (#0–#8) of the backtrace.

(gdb) bt
#0  0x00000000004e9bb8 in ngx_http_trailers_filter (r=0x1128e350, in=0x7ffe64678c60)
    at src/http/modules/ngx_http_headers_filter_module.c:264
#1  0x0000000000539754 in ngx_http_jflv_body_filter (r=0x1128e350, in=0x7ffe64678c60) at addon/jflv/ngx_http_jflv_module.c:263
#2  0x000000000053c931 in ngx_http_jbilling_body_filter (r=0x1128e350, in=0x7ffe64678c60)
    at addon/jbilling/ngx_http_jbilling_module.c:2192
#3  0x000000000053fef1 in ngx_http_jbilling_body_filter (r=0x1128e350, in=0x7ffe64678c60)
    at addon/thirdparty_billing/ngx_http_thirdparty_billing_module.c:2111
#4  0x0000000000549690 in ngx_http_sub_grep_body_filter (r=0x1128e350, in=0x7ffe64678c60)
    at addon/sub_grep_filter/ngx_http_sub_grep_filter_module.c:299
#5  0x000000000052cd5f in ngx_http_upstream_jconhash_body_filter (r=0x1128e350, chain=0x7ffe64678c60)
    at addon/upstream_jconhash/ngx_http_upstream_jconhash_module.c:1664
#6  0x00000000005310c3 in ngx_http_upstream_lb_chash_body_filter (r=0x1128e350, chain=0x7ffe64678c60)
    at addon/upstream_lb_chash/chash/ngx_http_upstream_lb_chash_module.c:993
#7  0x000000000053cc93 in ngx_http_file_signature_body_filter (r=0x1128e350, in=0x7ffe64678c60)
    at addon/file_signature_filter/ngx_http_file_signature_filter_module.c:203
#8  0x000000000054b1a8 in ngx_http_trim_body_filter (r=0x1128e350, in=<optimized out>)
    at addon/jtrim/ngx_http_trim_filter_module.c:365

The code near src/http/modules/ngx_http_headers_filter_module.c:264 is:

static ngx_int_t
ngx_http_trailers_filter(ngx_http_request_t *r, ngx_chain_t *in)
{
    ngx_str_t                 value;
    ngx_uint_t                i, safe_status;
    ngx_chain_t              *cl;
    ngx_table_elt_t          *t;
    ngx_http_header_val_t    *h;
    ngx_http_headers_conf_t  *conf;

    conf = ngx_http_get_module_loc_conf(r, ngx_http_headers_filter_module);

    if (in == NULL
        || conf->trailers == NULL
        || !r->expect_trailers
        || r->header_only)
    {
        return ngx_http_next_body_filter(r, in);        <== coredump here
    }

    ...
}

I also disassembled the code of the innermost frame (frame #0):

(gdb) disassemble
Dump of assembler code for function ngx_http_trailers_filter:
   0x00000000004e9aeb <+0>: push   %r15
   0x00000000004e9aed <+2>: push   %r14
   0x00000000004e9aef <+4>: push   %r13
   0x00000000004e9af1 <+6>: push   %r12
   0x00000000004e9af3 <+8>: push   %rbp
   0x00000000004e9af4 <+9>: push   %rbx
   0x00000000004e9af5 <+10>:    sub    $0x28,%rsp
   0x00000000004e9af9 <+14>:    mov    %rdi,%r12
   0x00000000004e9afc <+17>:    mov    %rsi,%r15
   0x00000000004e9aff <+20>:    mov    0x28(%rdi),%rax
   0x00000000004e9b03 <+24>:    mov    0x5e5ed6(%rip),%rdx        # 0xacf9e0 <ngx_http_headers_filter_module>
   0x00000000004e9b0a <+31>:    lea    (%rax,%rdx,8),%rax
   0x00000000004e9b0e <+35>:    test   %rsi,%rsi
   0x00000000004e9b11 <+38>:    je     0x4e9bac <ngx_http_trailers_filter+193>
   0x00000000004e9b17 <+44>:    mov    (%rax),%r14
   0x00000000004e9b1a <+47>:    mov    0x20(%r14),%rcx
   0x00000000004e9b1e <+51>:    test   %rcx,%rcx
   0x00000000004e9b21 <+54>:    je     0x4e9bac <ngx_http_trailers_filter+193>
   0x00000000004e9b27 <+60>:    movzbl 0x511(%rdi),%eax
   0x00000000004e9b2e <+67>:    and    $0xffffffc0,%eax
   0x00000000004e9b31 <+70>:    cmp    $0x80,%al
   0x00000000004e9b33 <+72>:    jne    0x4e9bac <ngx_http_trailers_filter+193>
   0x00000000004e9b35 <+74>:    mov    (%rsi),%rax
   0x00000000004e9b38 <+77>:    cmpb   $0x0,0x48(%rax)
   0x00000000004e9b3c <+81>:    js     0x4e9b57 <ngx_http_trailers_filter+108>
   0x00000000004e9b3e <+83>:    mov    %rsi,%rax
   0x00000000004e9b41 <+86>:    mov    0x8(%rax),%rax
--Type <RET> for more, q to quit, c to continue without paging--
   0x00000000004e9b45 <+90>:    test   %rax,%rax
   0x00000000004e9b48 <+93>:    je     0x4e9c6d <ngx_http_trailers_filter+386>
   0x00000000004e9b4e <+99>:    mov    (%rax),%rdx
   0x00000000004e9b51 <+102>:   cmpb   $0x0,0x48(%rdx)
   0x00000000004e9b55 <+106>:   jns    0x4e9b41 <ngx_http_trailers_filter+86>
   0x00000000004e9b57 <+108>:   mov    0x268(%r12),%rax
   0x00000000004e9b5f <+116>:   cmp    $0xce,%rax
   0x00000000004e9b65 <+122>:   je     0x4e9bc7 <ngx_http_trailers_filter+220>
   0x00000000004e9b67 <+124>:   ja     0x4e9c7e <ngx_http_trailers_filter+403>
   0x00000000004e9b6d <+130>:   mov    $0x0,%r13d
   0x00000000004e9b73 <+136>:   cmp    $0xc8,%rax
   0x00000000004e9b79 <+142>:   jb     0x4e9b97 <ngx_http_trailers_filter+172>
   0x00000000004e9b7b <+144>:   mov    $0x1,%r13d
   0x00000000004e9b81 <+150>:   cmp    $0xc9,%rax
   0x00000000004e9b87 <+156>:   jbe    0x4e9b97 <ngx_http_trailers_filter+172>
   0x00000000004e9b89 <+158>:   cmp    $0xcc,%rax
   0x00000000004e9b8f <+164>:   sete   %r13b
   0x00000000004e9b93 <+168>:   movzbl %r13b,%r13d
   0x00000000004e9b97 <+172>:   mov    (%rcx),%rbx
   0x00000000004e9b9a <+175>:   cmpq   $0x0,0x8(%rcx)
   0x00000000004e9b9f <+180>:   je     0x4e9c44 <ngx_http_trailers_filter+345>
   0x00000000004e9ba5 <+186>:   mov    $0x0,%ebp
   0x00000000004e9baa <+191>:   jmp    0x4e9c03 <ngx_http_trailers_filter+280>
   0x00000000004e9bac <+193>:   mov    %r15,%rsi
   0x00000000004e9baf <+196>:   mov    %r12,%rdi
   0x00000000004e9bb2 <+199>:   callq  *0x62aa70(%rip)        # 0xb14628 <ngx_http_next_body_filter>
=> 0x00000000004e9bb8 <+205>:   add    $0x28,%rsp
   0x00000000004e9bbc <+209>:   pop    %rbx
--Type <RET> for more, q to quit, c to continue without paging--
   0x00000000004e9bbd <+210>:   pop    %rbp
   0x00000000004e9bbe <+211>:   pop    %r12
   0x00000000004e9bc0 <+213>:   pop    %r13
   0x00000000004e9bc2 <+215>:   pop    %r14
   0x00000000004e9bc4 <+217>:   pop    %r15
   0x00000000004e9bc6 <+219>:   retq
   0x00000000004e9bc7 <+220>:   mov    $0x1,%r13d

rip points to 0x00000000004e9bb8, so the instruction that was executing should be the one at 0x00000000004e9bb2, `callq *0x62aa70(%rip)`. I searched some documentation on what the callq instruction does (e.g. https://web.stanford.edu/class/archive/cs/cs107/cs107.1186/guide/x86-64.html), which says:

The callq instruction takes one operand, the address of the function being called. It pushes the return address (current value of %rip, which is the next instruction after the call) onto the stack and then jumps to the address of the function being called.

It seems that it only pushes the return address onto the stack and sets rip to the address of the function being called, so normally it shouldn't make the process core dump.

I also suspect another possibility: the crash actually happens when ngx_http_next_body_filter executes retq. The instructions at 0x00000000004e9bac (mov %r15,%rsi) and 0x00000000004e9baf (mov %r12,%rdi) were executed just before the core dump, so %r15 and %rsi should be equal (the `in` parameter in the source code), and %r12 and %rdi should be equal too (the `r` parameter in the source code), but the actual register values are not what we expected:

(gdb) i r
rax            0x0                 0
rbx            0x1128e350          287892304
rcx            0x1128e300          287892224
rdx            0x0                 0
rsi            0x0                 0
rdi            0xbe                190
rbp            0x0                 0x0
rsp            0x7ffe64678700      0x7ffe64678700
r8             0xbe                190
r9             0x7ffe64677ec0      140730582924992
r10            0x10d7f170          282587504
r11            0x246               582
r12            0x1128e350          287892304
r13            0x7ffe64678c60      140730582928480
r14            0x3148b28           51677992
r15            0x7ffe64678c60      140730582928480
rip            0x4e9bb8            0x4e9bb8 <ngx_http_trailers_filter+205>
eflags         0x10202             [ IF RF ]
cs             0x33                51
ss             0x2b                43
ds             0x0                 0
es             0x0                 0
fs             0x0                 0
gs             0x0                 0

So maybe rsi and rdi were modified by ngx_http_next_body_filter, and the crash actually happened when ngx_http_next_body_filter executed retq. Is that possible?

Is there any advice on how to analyse this problem?

I also have some other questions:

  1. Normally, what kinds of code would cause a core dump when calling another function?
  2. Are there any docs that explain similar situations?
  • Could you possibly compile your code with `-fsanitize=address` [ref.](https://clang.llvm.org/docs/AddressSanitizer.html)? – Zoso Apr 10 '21 at 16:09
  • We always use gcc as our compiler and have never used clang to compile our nginx before. I can give it a try. Besides, it has only core dumped twice on one of our 50 servers in 5 days, and it is hard to reproduce in our test environment. – Kevin Gu Apr 10 '21 at 16:36
  • My bad for pointing to CLANG documentation. See [this](https://stackoverflow.com/a/40215639/1851678) for using ASAN with GCC – Zoso Apr 10 '21 at 16:40
  • It could be due to an earlier stack overflow overwriting a saved base pointer that is then written into the stack pointer. At that point, `call` or `ret` could cause the segfault. What is the stack pointer from the coredump? – mkayaalp Apr 10 '21 at 19:10
  • Also, replace [tag:c++] and [tag:disassembly] tags with [tag:segmentation-fault]. – mkayaalp Apr 10 '21 at 19:15

0 Answers