UART high write latency

I’m experiencing unexpectedly high UART write latency. In an ideal case, we’d expect a message of 109 bytes at a baud rate of 230400 to take ~4ms. I’m getting ~15ms. A shorter message of 12 bytes takes ~11ms. Unexpectedly high latency is consistent across different baud rates, rates of sending, message sizes, and two separate UART ports. The time is measured from when we start writing to the /dev/ttyMSM* port and until tcdrain from termios returns.

Following advice from this discussion, I tried setting low_latency flag and increasing the kernel timer tick from 250 to 1000. Increasing the kernel timer tick did help quite a bit, but the latency is still not quite there. It now takes ~9ms to write 109 bytes at a baud rate of 230400. Is it possible that extra latency consists of more than a single kernel tick? I’ve also tried padding the messages to match the UART FIFO size, but that didn’t have an effect.

The kernel version is qcomlt-4.14. I’m attaching the BLSP configuration:

diff --git a/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi b/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi
index 25849a17fbe6..6d3943a7c30b 100644
--- a/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi
+++ b/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi
@@ -23,12 +23,13 @@
 / {
 	aliases {
 		serial0 = &blsp2_uart1;
-		serial1 = &blsp2_uart2;
+		serial1 = &blsp2_uart9;
 		serial2 = &blsp1_uart1;
+		serial3 = &blsp2_uart11;
 		i2c0	= &blsp1_i2c2;
 		i2c1	= &blsp2_i2c1;
 		i2c2	= &blsp2_i2c0;
		i2c3	= &blsp2_i2c5;
 		spi0	= &blsp1_spi0;
 		spi1	= &blsp2_spi5;
 	};
@@ -106,22 +107,101 @@
 			status = "okay";
 		};
+		serial@075b3000 {
+			label = "LS-UART2";
+			status = "okay";
+			pinctrl-names = "default", "sleep";
+			pinctrl-0 = <&blsp2_uart11_default>;
+			pinctrl-1 = <&blsp2_uart11_sleep>;
+		};
+
+		serial@075b1000 {
+			label = "LS-UART0";
+			status = "okay";
+			pinctrl-names = "default", "sleep";
+			pinctrl-0 = <&blsp2_uart9_default>;
+			pinctrl-1 = <&blsp2_uart9_sleep>;
+		};
+
 		spi@07575000 {
 		/* On Low speed expansion */
 			label = "LS-SPI0";
 			status = "okay";
 		};
diff --git a/arch/arm64/boot/dts/qcom/msm8996-pins.dtsi b/arch/arm64/boot/dts/qcom/msm8996-pins.dtsi
index 1759e552e3eb..aecf9e684072 100644
--- a/arch/arm64/boot/dts/qcom/msm8996-pins.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8996-pins.dtsi
@@ -93,6 +93,54 @@
 		};
 	};
 
+	blsp2_uart11_default: blsp2_uart11 {
+		pinmux {
+			function = "blsp_uart11";
+			pins = "gpio58", "gpio59", "gpio60", "gpio61";
+		};
+		pinconf {
+			pins = "gpio58", "gpio59", "gpio60", "gpio61";
+			drive-strength = <16>;
+			bias-disable;
+		};
+	};
+
+	blsp2_uart11_sleep: blsp2_uart11_sleep {
+		pinmux {
+			function = "gpio";
+			pins = "gpio58", "gpio59", "gpio60", "gpio61";
+		};
+		pinconf {
+			pins = "gpio58", "gpio59", "gpio60", "gpio61";
+			drive-strength = <2>;
+			bias-disable;
+		};
+	};
+
+	blsp2_uart9_default: blsp2_uart9 {
+		pinmux {
+			function = "blsp_uart9";
+			pins = "gpio49", "gpio50", "gpio51", "gpio52";
+		};
+		pinconf {
+			pins = "gpio49", "gpio50", "gpio51", "gpio52";
+			drive-strength = <16>;
+			bias-disable;
+		};
+	};
+
+	blsp2_uart9_sleep: blsp2_uart9_sleep {
+		pinmux {
+			function = "gpio";
+			pins = "gpio49", "gpio50", "gpio51", "gpio52";
+		};
+		pinconf {
+			pins = "gpio49", "gpio50", "gpio51", "gpio52";
+			drive-strength = <2>;
+			bias-disable;
+		};
+	};
+
 	blsp1_i2c2_default: blsp1_i2c2_default {
 		pinmux {
 			function = "blsp_i2c3";
diff --git a/arch/arm64/boot/dts/qcom/msm8996.dtsi b/arch/arm64/boot/dts/qcom/msm8996.dtsi
index 6dfd248a270e..6f75493c677d 100644
--- a/arch/arm64/boot/dts/qcom/msm8996.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8996.dtsi
@@ -714,6 +714,53 @@
 			#address-cells = <1>;
 			#size-cells = <0>;
 			status = "disabled";
+		blsp2_uart9: serial@075b1000 {
+			compatible = "qcom,msm-uartdm-v1.4", "qcom,msm-uartdm";
+			reg = <0x075b1000 0x1000>;
+			interrupts = <GIC_SPI 115 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&gcc GCC_BLSP2_UART3_APPS_CLK>,
+			<&gcc GCC_BLSP2_AHB_CLK>;
+			clock-names = "core", "iface";
+			status = "disabled";
+		};
+
+		blsp2_uart11: serial@075b3000 {
+			compatible = "qcom,msm-uartdm-v1.4", "qcom,msm-uartdm";
+			reg = <0x075b3000 0x1000>;
+			interrupts = <GIC_SPI 117 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&gcc GCC_BLSP2_UART5_APPS_CLK>,
+			<&gcc GCC_BLSP2_AHB_CLK>;
+			clock-names = "core", "iface";
+			status = "disabled";
 		};
 
 		blsp2_i2c0: i2c@075b5000 {
@@ -762,15 +809,15 @@
 			status = "disabled";
 		};
 
-		blsp2_uart2: serial@75b1000 {
-			compatible = "qcom,msm-uartdm-v1.4", "qcom,msm-uartdm";
-			reg = <0x075b1000 0x1000>;
-			interrupts = <GIC_SPI 115 IRQ_TYPE_LEVEL_HIGH>;
-			clocks = <&gcc GCC_BLSP2_UART3_APPS_CLK>,
-				 <&gcc GCC_BLSP2_AHB_CLK>;
-			clock-names = "core", "iface";
-			status = "disabled";
-		};
 
 		blsp1_i2c2: i2c@07577000 {
 			compatible = "qcom,i2c-qup-v2.2.1";
diff --git a/drivers/tty/serial/msm_serial.c b/drivers/tty/serial/msm_serial.c
index 1db79ee8a886..14bc0ff6a81a 100644
--- a/drivers/tty/serial/msm_serial.c
+++ b/drivers/tty/serial/msm_serial.c
@@ -1560,6 +1560,15 @@ static struct msm_port msm_uart_ports[] = {
 			.line = 2,
 		},
 	},
+	{
+		.uart = {
+			.iotype = UPIO_MEM,
+			.ops = &msm_uart_pops,
+			.flags = UPF_BOOT_AUTOCONF,
+			.fifosize = 64,
+			.line = 3,
+		},
+	}
 };
 
 #define UART_NR	ARRAY_SIZE(msm_uart_ports)

This looks like is a high per-operation fixed cost (high fixed cost would lead us to predict 12 bytes is ~4ms faster than 109 bytes and it is).

ftrace might give you a better idea what is going on here.

Even just observing the tracepoints (interrupt firing versus when your measurement task is scheduled, thread schedule, etc) would give you some clues about where the system is spending its time. If that is not enough to make things clear then you can do full function tracing to dig deeper.

I’ve tried enabling CONFIG_FTRACE when building kernel (it was disabled by default) but then I got kernel panic when booting. Also tried with different ftrace subconfigs enabled/disabled without luck.

It does seem the way I was measuring write latency was flawed though. I measured round-trip latency instead and got some surprising results (my setup is μSOM → FTDI UART to USB → PC). Round-trip latency of 109 bytes message at 921600 baud rate was ~14ms with occasional drops down to 8ms. However, removing the tcdrain call (call that explicitly waits for transmission to happen) resulted in a consistent round-trip latency of ~7-8ms. Meaning that most of the time we were still waiting in tcdrain call, when the message echo already arrived. Increasing kernel tick rate only decreased the tcdrain call duration but didn’t significantly decrease the actual round-trip latency.

The current latency still isn’t ideal but it ended up being good enough for my use case.
It’s not as trivial to see where the extra latency comes from now, since the problem could be at any of the three communication points. I’m fairly confident FTDI chip is well set up for low latency (event characters are enabled, so the buffer gets flushed at the end of every message).

Very odd that you could not enable ftrace. It doesn’t really do anything until it is enabled post-boot (except insert a nop at the end of every kernel function) so I’m surprised it regresses the boot. I’ve not seen anything like that before (you’re not building the kernel with clang are you?).

Anyhow… I’m glad you got it down low enough to make progress.