22

I have a Go1.5.1 process/app. When I run /usr/sbin/lsof -p on that process, I see a lot of "can't identify protocol".

monitor_ 13105 root  101u  sock      0,6      0t0 16960100 can't identify protocol
monitor_ 13105 root  102u  sock      0,6      0t0 21552427 can't identify protocol
monitor_ 13105 root  103u  sock      0,6      0t0 17565091 can't identify protocol
monitor_ 13105 root  104u  sock      0,6      0t0 18476870 can't identify protocol

proc status/limit/fd

[root@Monitor_q ~]# cat /proc/13105/status 
Name:   monitor_client
State:  S (sleeping)
Tgid:   13105
Pid:    13105
PPid:   13104
TracerPid:  0
Uid:    0   0   0   0
Gid:    0   0   0   0
Utrace: 0
FDSize: 16384
Groups: 
...


[root@Monitor_q ~]# cat /proc/13105/limits 
Limit                     Soft Limit           Hard Limit           Units     
Max cpu time              unlimited            unlimited            seconds   
Max file size             unlimited            unlimited            bytes     
Max data size             unlimited            unlimited            bytes     
Max stack size            10485760             unlimited            bytes     
Max core file size        0                    unlimited            bytes     
Max resident set          unlimited            unlimited            bytes     
Max processes             3870                 3870                 processes 
Max open files            9999                 9999                 files     
Max locked memory         65536                65536                bytes     
Max address space         unlimited            unlimited            bytes     
Max file locks            unlimited            unlimited            locks     
Max pending signals       3870                 3870                 signals   
Max msgqueue size         819200               819200               bytes     
Max nice priority         0                    0                    
Max realtime priority     0                    0                    
Max realtime timeout      unlimited            unlimited            us

[root@Monitor_q ~]# ll /proc/13105/fd/
lrwx------ 1 root root 64 Dec  7 00:15 8382 -> socket:[52023221]
lrwx------ 1 root root 64 Dec  7 00:15 8383 -> socket:[51186627]
lrwx------ 1 root root 64 Dec  7 00:15 8384 -> socket:[51864232]
lrwx------ 1 root root 64 Dec  7 00:15 8385 -> socket:[52435453]
lrwx------ 1 root root 64 Dec  7 00:15 8386 -> socket:[51596071]
lrwx------ 1 root root 64 Dec  7 00:15 8387 -> socket:[52767667]
lrwx------ 1 root root 64 Dec  7 00:15 8388 -> socket:[52090632]
lrwx------ 1 root root 64 Dec  7 00:15 8389 -> socket:[51739068]
lrwx------ 1 root root 64 Dec  7 00:15 839 -> socket:[22963529]
lrwx------ 1 root root 64 Dec  7 00:15 8390 -> socket:[52023223]
lrwx------ 1 root root 64 Dec  7 00:15 8391 -> socket:[52560389]
lrwx------ 1 root root 64 Dec  7 00:15 8392 -> socket:[52402565]
...

but there is no similar output in netstat -a.

What are these sockets and how can I find out what they do?

monitor_client.go

package main

import (
    "crypto/tls"
    "encoding/json"
    "fmt"
    "log"
    "net"
    "net/http"
    nurl "net/url"
    "strconv"
    "strings"
    "syscall"
    "time"
)

type Result struct {
    Error      string        `json:"error"`
    HttpStatus int           `json:"http_status"`
    Stime      time.Duration `json:"http_time"`
}

//http://stackoverflow.com/questions/20990332/golang-http-timeout-and-goroutines-accumulation
//http://3.3.3.3/http?host=3.2.4.2&servername=a.test&path=/&port=33&timeout=5&scheme=http
func MonitorHttp(w http.ResponseWriter, r *http.Request) {
    var host, servername, path, port, scheme string
    var timeout int
    u, err := nurl.Parse(r.RequestURI)
    if err != nil {
        log.Fatal(err)
        return
    }
    if host = u.Query().Get("host"); host == "" {
        host = "127.0.0.0"
    }
    if servername = u.Query().Get("servername"); servername == "" {
        servername = "localhost"
    }
    if path = u.Query().Get("path"); path == "" {
        path = "/"
    }
    if port = u.Query().Get("port"); port == "" {
        port = "80"
    }
    if scheme = u.Query().Get("scheme"); scheme == "" {
        scheme = "http"
    }

    if timeout, _ = strconv.Atoi(u.Query().Get("timeout")); timeout == 0 {
        timeout = 5
    }

    //log.Printf("(host)=%s (servername)=%s (path)=%s (port)=%s (timeout)=%d", host, servername, path, port, timeout)

    w.Header().Set("Content-Type", "application/json")

    res := httptool(host, port, servername, scheme, path, timeout)
    result, _ := json.Marshal(res)
    fmt.Fprintf(w, "%s", result)
}

func httptool(ip, port, servername, scheme, path string, timeout int) Result {

    var result Result
    startTime := time.Now()
    host := ip + ":" + port

    transport := &http.Transport{
        TLSClientConfig:   &tls.Config{InsecureSkipVerify: true},
        DisableKeepAlives: true,
    }

    dialer := net.Dialer{
        Timeout:   time.Duration(timeout) * time.Second,
        KeepAlive: 0 * time.Second,
    }
    transport.Dial = func(network, address string) (net.Conn, error) {
        return dialer.Dial(network, address)
    }

    client := &http.Client{
        Transport: transport,
    }
    rawquery := ""
    url := fmt.Sprintf("%s://%s%s%s", scheme, host, path, rawquery)
    req, err := http.NewRequest("GET", url, nil)
    if err != nil {
        result.HttpStatus = -1
        errs := strings.Split(err.Error(), ": ")
        result.Error = errs[len(errs)-1]
        result.Stime = time.Now().Sub(startTime) / time.Millisecond
        return result
    }
    req.Header.Set("User-Agent", "monitor worker")
    req.Header.Set("Connection", "close")
    req.Host = servername
    resp, err := client.Do(req)
    //https://github.com/Basiclytics/neverdown/blob/master/check.go
    if err != nil {
        nerr, ok := err.(*nurl.Error)
        if ok {
            switch cerr := nerr.Err.(type) {
            case *net.OpError:
                switch cerr.Err.(type) {
                case *net.DNSError:
                    errs := strings.Split(cerr.Error(), ": ")
                    result.Error = "dns: " + errs[len(errs)-1]
                default:
                    errs := strings.Split(cerr.Error(), ": ")
                    result.Error = "server: " + errs[len(errs)-1]
                }
            default:
                switch nerr.Err.Error() {
                case "net/http: request canceled while waiting for connection":
                    errs := strings.Split(cerr.Error(), ": ")
                    result.Error = "timeout: " + errs[len(errs)-1]

                default:
                    errs := strings.Split(cerr.Error(), ": ")
                    result.Error = "unknown: " + errs[len(errs)-1]
                }
            }

        } else {
            result.Error = "unknown: " + err.Error()
        }
        result.HttpStatus = -2
        result.Stime = time.Now().Sub(startTime) / time.Millisecond
        return result
    }
    resp.Body.Close()
    result.HttpStatus = resp.StatusCode
    result.Error = "noerror"
    result.Stime = time.Now().Sub(startTime) / time.Millisecond //spend time (ms)
    return result
}

func setRlimit() {
    var rLimit syscall.Rlimit
    err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &rLimit)
    if err != nil {
        log.Printf("Unable to obtain rLimit", err)
    }
    if rLimit.Cur < rLimit.Max {
        rLimit.Max = 9999
        rLimit.Cur = 9999
        err = syscall.Setrlimit(syscall.RLIMIT_NOFILE, &rLimit)
        if err != nil {
            log.Printf("Unable to increase number of open files limit", err)
        }
    }
}

func main() {
    setRlimit()
    s := &http.Server{
        Addr:         ":59059",
        ReadTimeout:  7 * time.Second,
        WriteTimeout: 7 * time.Second,
    }
    http.HandleFunc("/http", MonitorHttp)

    log.Fatal(s.ListenAndServe())
}
Tom
  • 1,387
  • 3
  • 19
  • 30
user587170
  • 221
  • 2
  • 3
  • 4
    why are you building the entire client, dialer, transport, etc stack for each call? why not use a single client for everything? the client does connection pooling and recycling, etc. – Not_a_Golfer Dec 06 '15 at 20:15
  • 8
    You have no timeout for your requests, so any request that hangs will leave an open connection. You also have disabled TCP keepalives, so broken connections may never be detected. – JimB Dec 07 '15 at 01:47
  • @JimB thanks for your answer. – user587170 Dec 07 '15 at 02:25
  • 2
    Other possible issue in the code: Your error handling isn't correct, and is very verbose for the sake of just changing the error string slightly. You're sending Connection: close a second time for no reason. You have read/write timeouts on the server side, but no way to exit the handler early (another place that may be keeping sockets from closing). You shouldn't create new Transports (or Clients) for each call, and use the DefaultTransport unless you have a reason to override it. The code samples you've cited in comments aren't really correct, refer to the official docs. – JimB Dec 07 '15 at 14:46
  • 2
    Indeed, as @JimB suspects, 'lsof' prints "can't identify protocol" for half-open sockets. – BadZen Nov 07 '16 at 15:18
  • 4
    Possible duplicate of [Seeing too many lsof can't identify protocol](http://stackoverflow.com/questions/7911840/seeing-too-many-lsof-cant-identify-protocol) – Kerem Feb 19 '17 at 14:46
  • @JimB Why don't you post an answer? This is AFAIK, the top voted "unanswered" go question. – Milo Christiansen Oct 09 '17 at 23:11

2 Answers2

2

There are couple of points here.

I was unable to reproduce your behavior, anyway, can't identify protocol is usually tied to sockets not being properly closed.

Some commenters suggested you don't have to create http client inside each handler - that's true. Simply create it once and reuse.

Second, I'm not sure why are you creating your own http.Client struct and why you're disabling keepalives. Can't you just go with http.Get ? Simpler code is easier to debug.

Third, not sure why are you overwriting transport.Dial function. Even if you must do it, the documentation (for Go 1.9.2) says:

% go doc http.transport.dial
type Transport struct {
    // Dial specifies the dial function for creating unencrypted TCP
    connections.
    //
    // Deprecated: Use DialContext instead, which allows the transport
    // to cancel dials as soon as they are no longer needed.
    // If both are set, DialContext takes priority.
    Dial func(network, addr string) (net.Conn, error)

That comment about deprecation and lack of dials reuse may point to the source of your problems.

To sum up, when in your shoes, I'd do two things:

  • move client creation to the code which executes once, or just use default client with http.Get
  • I'd clean up this thing with overwriting default transport fields, if you must do it then I'd use DialContext as suggested.

Good luck.

Wojciech Kaczmarek
  • 2,173
  • 4
  • 23
  • 40
0

I couldn't reproduce the issue. But here are my 2 cents (no pun intended)

  1. Simmilar issue was found in SockJS-node noticed in an article https://idea.popcount.org/2012-12-09-lsof-cant-identify-protocol/ according to this issue was observed on FreeBSD. But the issue was "websockets are not bieng properly cleaned up"
  2. Another test test I would like ou to do if you still have hands on same environment. If possible post wireshark logs. just to confirm there are not subtle things in network frames which may have caused this.

I am sorry I cann't install Go 1.5.1 just to reproduce this issue. Hope this was helpful.

Devidas
  • 2,479
  • 9
  • 24