diff --git a/kstool/kstool.cpp b/kstool/kstool.cpp
index dedbdfdc6f7fc6f1645568d24e1281065ed177e7..527dbd873f000e8efa5cb7042e7ae8dd71a6c9f8 100644
--- a/kstool/kstool.cpp
+++ b/kstool/kstool.cpp
@@ -68,6 +68,8 @@ static void usage(char *prog)
     if (ks_arch_supported(KS_ARCH_SYSTEMZ)) {
         printf("        systemz:   SystemZ (S390x)\n");
     }
+
+    printf("\n");
 }
 
 int main(int argc, char **argv)
@@ -85,7 +87,7 @@ int main(int argc, char **argv)
         mode = argv[1];
 
         int flags;
-        if (-1 == (flags = fcntl(STDIN_FILENO, F_GETFL, 0)))
+        if ((flags = fcntl(STDIN_FILENO, F_GETFL, 0)) == -1)
             flags = 0;
 
         fcntl(STDIN_FILENO, F_SETFL, flags | O_NONBLOCK);
@@ -93,7 +95,7 @@ int main(int argc, char **argv)
         size_t index = 0;
 
         char buf[1024];
-        while( fgets(buf, sizeof(buf), stdin) ) {
+        while(fgets(buf, sizeof(buf), stdin)) {
             input = (char*)realloc(assembly, index + strlen(buf));
             if (!input) {
                 printf("Failed to allocate memory.");
@@ -104,7 +106,13 @@ int main(int argc, char **argv)
             index += strlen(buf);
         }
 
+        fcntl(STDIN_FILENO, F_SETFL, flags);
+
         assembly = input;
+        if (!assembly) {
+            usage(argv[0]);
+            return -1;
+        }
     } else if (argc == 3) {
 #else
     if (argc == 3) {
diff --git a/suite/test_roundtrips.py b/suite/test_roundtrips.py
index 8075e7d6ede36fac31c1c7a8965f9cd67bb2653e..ccbd2c07cf9d180f7ede5bffbc80b6b3244e97e8 100755
--- a/suite/test_roundtrips.py
+++ b/suite/test_roundtrips.py
@@ -45,6 +45,8 @@ roundtrip_tests = [
     ("x64", "vaddps zmm3 {k7} {z}, zmm18, dword ptr [r15 + xmm6*4 + 0x1f0]{1to16}"),
     ("x64", "vcvtsd2si edi, qword ptr [rbp + 0x18]"),
     ("x64", "vcvtss2si r9d, dword ptr [rdx - 0x64]"),
+    ("x64", "vfmsub213ps zmm18, zmm19, dword ptr [rsi + 0x64]{1to16}"),
+    ("x64", "vfmsub213sd xmm16, xmm16, xmmword ptr [rdi + 0x2d8]"),
     ("x64", "vmaxpd zmm27 {k3}, zmm9, zmmword ptr [r15 - 0xc00]"),
     ("x64", "vmaxps zmm0 {k2} {z}, zmm8, zmmword ptr [r9 - 0x1b40]"),
     ("x64", "vminps zmm2 {k6} {z}, zmm18, dword ptr [r14 - 0x1e4]{1to16}"),
@@ -52,11 +54,14 @@ roundtrip_tests = [
     ("x64", "vpermi2pd zmm30 {k2} {z}, zmm19, zmmword ptr [rcx + 0x200]"),
     ("x64", "vpermi2ps zmm3 {k3} {z}, zmm28, zmmword ptr [r8 + 0xcc0]"),
     ("x64", "vpermt2d zmm14 {k7}, zmm6, zmmword ptr [r13 + 0x1280]"),
+    ("x64", "vpermt2pd zmm12 {k2}, zmm1, zmmword ptr [r8 - 0x1ec0]"),
     ("x64", "vpmaxsd zmm31 {k5}, zmm8, zmmword ptr [r8 - 0x780]"),
     ("x64", "vpmaxsq zmm16 {k7}, zmm2, qword ptr [rsi - 0x228]{1to8}"),
     ("x64", "vpminsd zmm4 {k1} {z}, zmm25, zmmword ptr [r8 + 0x1d80]"),
     ("x64", "vpminsq zmm28 {k5}, zmm28, zmmword ptr [r15 + xmm3 - 0x1400]"),
     ("x64", "vpminuq zmm21 {k2}, zmm15, zmmword ptr [r10 - 0x1300]"),
+    ("x64", "vpmovqw xmm23 {k6}, zmm11"),
+    ("x64", "vpmuludq zmm2 {k7} {z}, zmm31, zmmword ptr [rsi - 0xe80]"),
     ("x64", "vpord zmm15 {k2} {z}, zmm8, zmmword ptr [rdi + r11*4 + 0xc40]"),
     ("x64", "vporq zmm29 {k5} {z}, zmm2, zmmword ptr fs:[r11 - 0x1c40]"),
     ("x64", "vpsllq zmm21 {k2}, zmm28, xmmword ptr [rcx - 0x2b0]"),
@@ -65,6 +70,7 @@ roundtrip_tests = [
     ("x64", "vpsubq zmm19 {k4}, zmm24, zmmword ptr [rbx + 0x10c0]"),
     ("x64", "vpxord zmm12 {k5} {z}, zmm8, zmmword ptr [rbp + 0x740]"),
     ("x64", "vpxorq zmm21 {k2}, zmm1, zmmword ptr [rbx - 0x1180]"),
+    ("x64", "vsubpd zmm6 {k1}, zmm19, zmmword ptr [rdi - 0x1100]"),
     ("x64", "xchg rax, rax"),
     ("x64", "xor qword ptr [esi + 0x1df54066], 0x6c"),
 ]