从源码解析Go exec timeout 实现机制

从源码解析Go exec timeout 实现机制

1. 背景

环境:golang 1.9,drawn

测试使用golang exec 执行命令,并配置过期时间,测试脚本如下。

现象:执行脚本后,到超时时间后并为超时退出,反而阻塞住了

func TestExecWithTimeout(t *testing.T) {
        ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
        defer cancel()

        start := time.Now()
        cmd := exec.CommandContext(ctx, "sh", "-c", "echo start && sleep 10 && echo end.")
        out, err := cmd.CombinedOutput()
        fmt.Printf("error: [%v]\n", err)
        fmt.Printf("out: [%v]\n", string(out))

        if ctx.Err() == context.DeadlineExceeded {
                fmt.Printf("ctx.Err: [%v]\n", ctx.Err())
        }

        fmt.Printf("process end: %v", time.Since(start))
}
error: [signal: killed]
out: [start
]
ctx.Err: [context deadline exceeded]
process end: 10.010193583sPASS

在用ps查看进程状态,发现,超时时间到达后,sh进程就被kill了,而由ssh启动的sleep 10命令还在进行中,且该进程被1号进程接管,变成了僵尸进程。当我们手动执行kill命令后,程序退出。

~/workpro//mkskit ❯ ps -ef | grep "sleep"
  501 85600 85597   0  3:34下午 ??         0:00.00 sh -c echo start && sleep 20 && echo end.
  501 85601 85600   0  3:34下午 ??         0:00.00 sleep 20
  501 85608 17687   0  3:34下午 ttys004    0:00.00 grep --color=auto --exclude-dir=.bzr --exclude-dir=CVS --exclude-dir=.git --exclude-dir=.hg --exclude-dir=.svn --exclude-dir=.idea --exclude-dir=.tox sleep
~/workpro//mkskit ❯ ps -ef | grep "sleep"
  501 85601     1   0  3:34下午 ??         0:00.00 sleep 20
  501 85652 17687   0  3:35下午 ttys004    0:00.00 grep --color=auto --exclude-dir=.bzr --exclude-dir=CVS --exclude-dir=.git --exclude-dir=.hg --exclude-dir=.svn --exclude-dir=.idea --exclude-dir=.tox sleep

2. 原因分析

从现象推测,exec 创建了两个进程,超时后 sh进程退出,但是sleep进程还在变成了僵尸进程,sleep进程还未退出,导致整改go主进程阻塞。

2.1 源码-分析

// CombinedOutput runs the command and returns its combined standard
// output and standard error.
func (c *Cmd) CombinedOutput() ([]byte, error) {
        if c.Stdout != nil {
                return nil, errors.New("exec: Stdout already set")
        }
        if c.Stderr != nil {
                return nil, errors.New("exec: Stderr already set")
        }
        var b bytes.Buffer
        c.Stdout = &b
        c.Stderr = &b
        err := c.Run()
        return b.Bytes(), err
}

// 执行命令,并等待命令执行完成
func (c *Cmd) Run() error {
        if err := c.Start(); err != nil {
                return err
        }
        return c.Wait()
}

CombinedOutput 方法会初始化一个字节缓冲区,并标准输出和标准错误导向该缓冲区。然后启动执行命令。

Run方法中调用Start和Wait方法:

  • Start方法用于启动子进程,启动后立即返回
  • Wait方法则阻塞,等待子进程结束并回收资源

2.1.1 Wait 方法

// Wait waits for the command to exit and waits for any copying to
// stdin or copying from stdout or stderr to complete.
//
// The command must have been started by Start.
//
// The returned error is nil if the command runs, has no problems
// copying stdin, stdout, and stderr, and exits with a zero exit
// status.
//
// If the command fails to run or doesn't complete successfully, the
// error is of type *ExitError. Other error types may be
// returned for I/O problems.
//
// If any of c.Stdin, c.Stdout or c.Stderr are not an *os.File, Wait also waits
// for the respective I/O loop copying to or from the process to complete.
//
// Wait releases any resources associated with the Cmd.
func (c *Cmd) Wait() error {
        if c.Process == nil {
                return errors.New("exec: not started")
        }
        if c.ProcessState != nil {
                return errors.New("exec: Wait was already called")
        }
        state, err := c.Process.Wait()
        if err == nil && !state.Success() {
                err = &ExitError{ProcessState: state}
        }
        c.ProcessState = state

        // Wait for the pipe-copying goroutines to complete.
        var copyError error
        for range c.goroutine {
                if err := <-c.goroutineErrs; err != nil && copyError == nil {
                        copyError = err
                }
        }
        c.goroutine = nil // Allow the goroutines' closures to be GC'd.

        if c.ctxErr != nil {
                interruptErr := <-c.ctxErr
                // If c.Process.Wait returned an error, prefer that.
                // Otherwise, report any error from the interrupt goroutine.
                if interruptErr != nil && err == nil {
                        err = interruptErr
                }
        }
        // Report errors from the copying goroutines only if the program otherwise
        // exited normally on its own. Otherwise, the copying error may be due to the
        // abnormal termination.
        if err == nil {
                err = copyError
        }

        c.closeDescriptors(c.closeAfterWait)
        c.closeAfterWait = nil

        return err
}

根据debug可知,阻塞发生在第34行, err := <-c.goroutineErrs;这句,从goroutineErrs中读取错误信息并返回第一次错误给调用者。而 <-ch从通道中获取数据阻塞的原因只有发送发未准备好,那么goroutineErr对应的发送方是谁呢?

2.1.2 Start 方法

// Start starts the specified command but does not wait for it to complete.
//
// If Start returns successfully, the c.Process field will be set.
//
// After a successful call to Start the Wait method must be called in
// order to release associated system resources.
func (c *Cmd) Start() error {
        if c.Path == "" && c.Err == nil && c.lookPathErr == nil {
                c.Err = errors.New("exec: no command")
        }
        if c.Err != nil || c.lookPathErr != nil {
                c.closeDescriptors(c.closeAfterStart)
                c.closeDescriptors(c.closeAfterWait)
                if c.lookPathErr != nil {
                        return c.lookPathErr
                }
                return c.Err
        }
        if runtime.GOOS == "windows" {
                lp, err := lookExtensions(c.Path, c.Dir)
                if err != nil {
                        c.closeDescriptors(c.closeAfterStart)
                        c.closeDescriptors(c.closeAfterWait)
                        return err
                }
                c.Path = lp
        }
        if c.Process != nil {
                return errors.New("exec: already started")
        }
        if c.ctx != nil {
                select {
                case <-c.ctx.Done():
                        c.closeDescriptors(c.closeAfterStart)
                        c.closeDescriptors(c.closeAfterWait)
                        return c.ctx.Err()
                default:
                }
        }

        c.childFiles = make([]*os.File, 0, 3+len(c.ExtraFiles))
        type F func(*Cmd) (*os.File, error)
        for _, setupFd := range []F{(*Cmd).stdin, (*Cmd).stdout, (*Cmd).stderr} {
                fd, err := setupFd(c)
                if err != nil {
                        c.closeDescriptors(c.closeAfterStart)
                        c.closeDescriptors(c.closeAfterWait)
                        return err
                }
                c.childFiles = append(c.childFiles, fd)
        }
        c.childFiles = append(c.childFiles, c.ExtraFiles...)

        env, err := c.environ()
        if err != nil {
                return err
        }

        c.Process, err = os.StartProcess(c.Path, c.argv(), &os.ProcAttr{
                Dir:   c.Dir,
                Files: c.childFiles,
                Env:   env,
                Sys:   c.SysProcAttr,
        })
        if err != nil {
                c.closeDescriptors(c.closeAfterStart)
                c.closeDescriptors(c.closeAfterWait)
                return err
        }

        c.closeDescriptors(c.closeAfterStart)

        // Don't allocate the goroutineErrs channel unless there are goroutines to fire.
        if len(c.goroutine) > 0 {
                errc := make(chan error, len(c.goroutine))
                c.goroutineErrs = errc
                for _, fn := range c.goroutine {
                        go func(fn func() error) {
                                errc <- fn()
                        }(fn)
                }
        }

        c.ctxErr = c.watchCtx()

        return nil
}

从第74-82行可以看到,创建了一个len(c.goroutine)的channel, 然后启动goroutine执行c.goroutine中的方法并将错误写入errc,由wait的现象可知,应该是这个fn()调用阻塞了。

继续追踪,c.goroutine是在哪里被赋值的。同样是在Start方法中

主要看第43行,学习到了,原来函数内还能定义类型。这里主要是调用了三个函数来初始化stdin, stderr, stdout.

这里主要先看下stdout、stderr

func (c *Cmd) stdout() (f *os.File, err error) {
        return c.writerDescriptor(c.Stdout)
}

func (c *Cmd) stderr() (f *os.File, err error) {
    // 如果stdrr 和 stdout 是同一个输出目标 则跳过
        if c.Stderr != nil && interfaceEqual(c.Stderr, c.Stdout) {
                return c.childFiles[1], nil
        }
        return c.writerDescriptor(c.Stderr)
}

func (c *Cmd) writerDescriptor(w io.Writer) (f *os.File, err error) {
    // case1
        if w == nil {
                f, err = os.OpenFile(os.DevNull, os.O_WRONLY, 0)
                if err != nil {
                        return
                }
                c.closeAfterStart = append(c.closeAfterStart, f)
                return
        }

    // case2
        if f, ok := w.(*os.File); ok {
                return f, nil
        }

    // case3
        pr, pw, err := os.Pipe()
        if err != nil {
                return
        }

        c.closeAfterStart = append(c.closeAfterStart, pw)
        c.closeAfterWait = append(c.closeAfterWait, pr)
        c.goroutine = append(c.goroutine, func() error {
                _, err := io.Copy(w, pr)
                pr.Close() // in case io.Copy stopped due to write error
                return err
        })
        return pw, nil
}

两个函数都调用了writerDescriptor,看代码主要由三个分支逻辑

  • case1:如果没有制定stderr或者stdout,就直接写入os.DevNull 即 /dev/null
  • case2:如果制定的stderr或stdout 是*os.File类型也直接返回,会把输出写入文件
  • case3:最后一种情况,创建管道,返回写端点,在Start函数中的第59行,子进程和管道fd进行绑定。关键在第38行,goroutine中绑定了一个func(),从管道的读端点读取数据并copy到指定的stderr或者stdout(均实现Writer)。
// Implementations must not retain p.
type Writer interface {
        Write(p []byte) (n int, err error)
}

io.Copy方法会一直阻塞到reader被关闭才会返回,这也是为什么会产生阻塞到原因了。在Wait方法到注释中可以得知

// If any of c.Stdin, c.Stdout or c.Stderr are not an *os.File, Wait also waits
// for the respective I/O loop copying to or from the process to complete.
//
// Wait releases any resources associated with the Cmd.
func (c *Cmd) Wait() error {}

到这里了解到了阻塞到根本原因,那超时子进程被kill是在哪里触发的。在看Start方法时,可以看到最后一行有一个watchCtx调用

// watchCtx conditionally starts a goroutine that waits until either c.ctx is
// done or c.Process.Wait has completed (called from Wait).
// If c.ctx is done first, the goroutine terminates c.Process.
//
// If a goroutine was started, watchCtx returns a channel on which its result
// must be received.
func (c *Cmd) watchCtx() <-chan error {
        if c.ctx == nil {
                return nil
        }

        errc := make(chan error)
        go func() {
                select {
                case errc <- nil:
                        return
                case <-c.ctx.Done():
                }

                var err error
                if killErr := c.Process.Kill(); killErr == nil {
                        // We appear to have successfully delivered a kill signal, so any
                        // program behavior from this point may be due to ctx.
                        err = c.ctx.Err()
                } else if !errors.Is(killErr, os.ErrProcessDone) {
                        err = wrappedError{
                                prefix: "exec: error sending signal to Cmd",
                                err:    killErr,
                        }
                }
                errc <- err
        }()

        return errc
}

可以看到,启动了一个goroutine,绑定了一个 error chan,selec监听ctx的状态,当ctx超时或者提前cancel了,则会出发process.kill,将子进程关闭。

正常情况下,当ctx超时,子进程会被kill,此时管道的写入端点自然会被关闭,io.Copy则会在copy完成后正常返回,给e.errch中发送一个nil,Wait方法则中c.errch中读取到nil。在Wait方法中的第26、40、44行可以看到,err最终被赋值为一个state第结构体对象。

但是在开始的demo中,除了sh这个子进程之外还启动了一个sleep子进程,context超时后,sleep进程依旧在运行,并持有管道的写端点,导致io.Copy阻塞。

2.2 相关流程

从源码解析Go exec timeout 实现机制

3. 解决方案

根据上述分析可得,进入case3且产生了子进程才会导致阻塞。那么只要不满足其中一个条件即可。

3.1 使用*os.File类型接收输入输出

可避免阻塞,但存在的问题:

  • 需要额外处理输出,如: 从文件读取并写入到需要的地方
  • 程序退出后,子子进程被1号进程托管称为僵尸进程
func TestExecUseFileOutput(t *testing.T) {
        ctx, cancel := context.WithTimeout(context.Background(), 8*time.Second)
        defer cancel()

        cmd := exec.CommandContext(ctx, "sh", "-c", "echo start && sleep 15 && echo end.")
        combinedOutput, err := ioutil.TempFile("", "stdouterr")
        if err != nil {
                fmt.Println(err)
                return
        }

        defer func() {
                _ = os.Remove(combinedOutput.Name())
        }()
        cmd.Stdout = combinedOutput
        cmd.Stderr = combinedOutput
        err = cmd.Run()
        if err != nil {
                fmt.Println(err)
                return
        }

        _, err = combinedOutput.Seek(0, 0)
        var b bytes.Buffer
        _, err = io.Copy(&b, combinedOutput)
        if err != nil {
                fmt.Println(err)
                return
        }

        fmt.Println("output:", b.String())
        fmt.Printf("ctx.Err: [%v]\n", ctx.Err())
        fmt.Printf("error: [%v]\n", err)
}

3.2 避免产生子进程

linux执行脚本的5种方式

  • 使用绝对路径执行:/root/sleep.sh
  • 使用相对路径:./sleep.sh 需要x权限
  • 使用sh或者bash命令来执行:bash /root/sleep.sh
  • 使用.(空格)脚本名称来执行: . ./sleep.sh
  • 使用source来执行(一般用来生效配置文件): source /root/sleep.sh

前三种方式都会产生bash 进程和sleep进程,后两种则只会产生sleep进程。

因为go中没有shell环境,如果执行复杂的命令,肯定只是用bash 或者 sh方式运行,肯定会产生一个新的进程,so这个方法无效

3.3 sh -c

bash -c command / sh -c command 方式执行单条命令的时候有相关的优化,是不会产生多个进程的,因此如果将demo中的复杂命令或者脚本拆分成多个命令执行也可以实现,但这种方式不够优雅。

原理:

单条命令时:启动bash进程后发现是一个简单的命令,在不fork新进程的情况下直接调用exec执行命令,然后将子shell替换为sleep命令。

多条命令时:需要使用子shell 来处理 && 操作符,它需要等待第一个命令终止的SIGCHLD,然后根据第一个命令的exit status 决定是否需要运行第二个命令,因此不能将子shell替换为sleep命令。

具体见 shell.c 第 1370 行

3.4 kill -pid kill进程组

linux kill(2) 指定 pid 为负数时会给这个进程组中的所有进程发送信号

If pid is less than -1, then sig is sent to every process in the        
process group whose ID is -pid.
func TestExecuteInOnPgid(t *testing.T) {
        ctx, cancel := context.WithTimeout(context.Background(), 8*time.Second)
        defer cancel()

        cmd := exec.CommandContext(ctx, "sh", "-c", "echo start && sleep 15 && echo end.")
        cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
        go func() {
                select {
                case <-ctx.Done():
                        if err := syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL); err != nil {
                                fmt.Println("kill failed: ", err)
                        }
                }
        }()

        output, err := cmd.CombinedOutput()
        if err != nil {
                fmt.Println(err)
                return
        }

        fmt.Println("output:", string(output))
        fmt.Printf("ctx.Err : [%v]\n", ctx.Err())
        fmt.Printf("error   : [%v]\n", err)
}

3.5 社区提案

该问题其实很早就存在了,最早可以追溯到这个 2017 年的 Issue #23019,不过为了保持向后兼容,在方案上一直没有达成共识,最新提案见这个 Issue #50436,根据 #53400 中的最新消息,该提案可能会在 Go 1.20 中实现。

大致方案为在 exec.Cmd 中添加一个 Interrupt(os.Signal) 字段,在 context 超时后将这个信号发送给子进程以关闭所有子进程。

 // Context is the context that controls the lifetime of the command
        // (typically the one passed to CommandContext).
        Context context.Context

        // If Interrupt is non-nil, Context must also be non-nil and Interrupt will be
        // sent to the child process when Context is done.
        //
        // If the command exits with a success code after the Interrupt signal has
        // been sent, Wait and similar methods will return Context.Err()
        // instead of nil.
        //
        // If the Interrupt signal is not supported on the current platform
        // (for example, if it is os.Interrupt on Windows), Start may fail
        // (and return a non-nil error).
        Interrupt os.Signal

        // If WaitDelay is non-zero, the command's I/O pipes will be closed after
        // WaitDelay has elapsed after either the command's process has exited or
        // (if Context is non-nil) Context is done, whichever occurs first.
        // If the command's process is still running after WaitDelay has elapsed,
        // it will be terminated with os.Kill before the pipes are closed.
        //
        // If the command exits with a success code after pipes are closed due to
        // WaitDelay and no Interrupt signal has been sent, Wait and similar methods
        // will return ErrWaitDelay instead of nil.
        //
        // If WaitDelay is zero (the default), I/O pipes will be read until EOF,
        // which might not occur until orphaned subprocesses of the command have
        // also closed their descriptors for the pipes.
        WaitDelay time.Duration

4. 总结

现象

使用 os/exec 执行 shell 脚本并设置超时时间,然后到超时时间之后程序并未超时退出,反而一直阻塞。

原因

os/exec 包执行命令时会创建子进程,通过管道连接子进程以收集命令执行结果,goroutine 从管道中读取命令输出,超时后会 kill 掉子进程,从而关闭管道,管道被关闭后 goroutine 则自动退出。

如果存在子子进程,占有管道则会导致 kill 掉子进程后管道依旧未能释放,读取输出的 goroutine 被阻塞,最终导致程序超时后也无法返回。

解决

package ssh

import "syscall"

// 实现Cancel方法
func (e *SomeExecuteCmdForward) Cancel() {
        if pgid := -e.pid; pgid < 0 {
                _ = syscall.Kill(pgid, syscall.SIGKILL)
                e.pid = 0
        }
}

// cmd.SysProcAttr = newSysProcAttr()
func newSysProcAttr() *syscall.SysProcAttr {
        return &syscall.SysProcAttr{
                Setpgid: true,
        }
}

kill -- -pid

5. 思考

为什么在start 进程起来之后就把文件句柄close了c.closeDescriptors(c.closeAfterStart),管道还可以继续读写?

在close之前已经将文件句柄传递给子进程了,相当于此时fd的引用计数为2,start调用的是关闭主进程引用的文件句柄,相当于引用计数-1。直到管道写入端close管道后,管道读取端就可以收到管道被关闭的信号,结束读取。

追踪closeDescriptors底层调用函数,可以看到也是进行引用计数-1,直到为0就关闭。

// Close closes the FD. The underlying file descriptor is closed by the
// destroy method when there are no remaining references.
func (fd *FD) Close() error {
        if !fd.fdmu.increfAndClose() {
                return errClosing(fd.isFile)
        }

        // Unblock any I/O.  Once it all unblocks and returns,
        // so that it cannot be referring to fd.sysfd anymore,
        // the final decref will close fd.sysfd. This should happen
        // fairly quickly, since all the I/O is non-blocking, and any
        // attempts to block in the pollDesc will return errClosing(fd.isFile).
        fd.pd.evict()

    // 关键代码
        // The call to decref will call destroy if there are no other
        // references.
        err := fd.decref()

        // Wait until the descriptor is closed. If this was the only
        // reference, it is already closed. Only wait if the file has
        // not been set to blocking mode, as otherwise any current I/O
        // may be blocking, and that would block the Close.
        // No need for an atomic read of isBlocking, increfAndClose means
        // we have exclusive access to fd.
        if fd.isBlocking == 0 {
                runtime_Semacquire(&fd.csema)
        }

        return err
}

// decref removes a reference from fd.
// It also closes fd when the state of fd is set to closed and there
// is no remaining reference.
func (fd *FD) decref() error {
        if fd.fdmu.decref() {
                return fd.destroy()
        }
        return nil
}


https://stackoverflow.com/questions/70175281/what-is-the-purpose-of-closeafterstart-in-exec

6. exec案例

6.1 实时读取标准输出

func TestReadStdoutRealTime(t *testing.T) {
        cmd := exec.Command("ping", "-c", "5", "192.168.0.1")
        stdout, _ := cmd.StdoutPipe()
        cmd.Start()

        collect := func(output io.Reader) error {
                scanner := bufio.NewScanner(output)
                scanner.Split(bufio.SplitFunc(bufio.ScanLines))
                for scanner.Scan() {
                        if scanner.Err() != nil {
                                fmt.Println(scanner.Err())
                                return scanner.Err()
                        }
                        line := scanner.Text()
                        fmt.Println(line)
                }

                return nil
        }

        if err := collect(stdout); err != nil {
                fmt.Println("collect stdout failed", err)
        }

        if err := cmd.Wait(); err != nil {
                fmt.Println("wait exec failed", err.Error())
        }
}

6.2 实时读取标准输出(错误用法)

func collector(ctx context.Context, output io.Reader) error {
        scanner := bufio.NewReader(output)
        for {
                // 当异步收集日志比较慢,但命令执行很快就退出后,会将pipe关闭,导致管道无法读取,
        // 报错 read |0: file already closed
                // time.Sleep(2 * time.Second)
                select {
                case <-ctx.Done():
                        if ctx.Err() != nil {
                                return ctx.Err()
                        }

                        return errors.New("process existed")
                default:
                        readStr, readErr := scanner.ReadString('\n')
                        if readErr != nil {
                                if readErr != io.EOF {
                                        return readErr
                                }

                                return nil
                        }
                        fmt.Println(readStr)
                }
        }
}

func TestReadStdoutWrongExample(t *testing.T) {
        ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
        defer cancel()
        cmd := exec.CommandContext(ctx, "ping", "-c", "5", "192.168.0.1")

        stdout, _ := cmd.StdoutPipe()
        cmd.Start()

        go func() {
                err := collector(ctx, stdout)
                if err != nil {
                        fmt.Println(err)
                }
        }()

        if err := cmd.Wait(); err != nil {
                fmt.Println("wait exec failed", err)
        }
}

7. 参考

Go exec 包执行命令超时失效问题分析及解决方案