Why does my root mount suddenly remount as read-only after a while?

I’m testing out NixOS and I’m having a strange issue. If I leave my PC for several hours (usually overnight) and come back, my root partition is suddenly mounted in read-only mode. This renders my entire system immediately unusable. Most of my applications immediately crash, most terminal commands just fail with a generic “Input/Output error”, including mount, so I’m unable to remount as read-write. I’m not even able to reboot the system, so I have to shut it down by holding the power button.

I’ve looked for errors using journalctl -b -1 but there’s nothing, possibly because nothing can be logged when the filesystem is readonly…

When I boot the system, the root mount is again read-write:

/dev/mapper/main-nixos on / type ext4 (rw,relatime)
/dev/mapper/main-nixos on /nix/store type ext4 (ro,relatime)

Have I done something stupid when installing NixOS? When I try to Google this issue, most results seem to imply that the root partition is supposed to be read-only, so I’m suspecting I may have installed NixOS wrong somehow.

Would appreciate any theories or suggestions.

Here’s my hardware-configuration.nix:

# Do not modify this file!  It was generated by ‘nixos-generate-config’
# and may be overwritten by future invocations.  Please make changes
# to /etc/nixos/configuration.nix instead.
{ config, lib, pkgs, modulesPath, ... }:

{
  imports =
    [ (modulesPath + "/installer/scan/not-detected.nix")
    ];

  boot.initrd.availableKernelModules = [ "nvme" "xhci_pci" "ahci" "usbhid" "usb_storage" "sd_mod" ];
  boot.initrd.kernelModules = [ "dm-snapshot" "amdgpu" ];
  boot.kernelModules = [ "kvm-amd" ];
  boot.extraModulePackages = [ ];

  hardware.opengl.driSupport = true;
  hardware.opengl.driSupport32Bit = true;

  fileSystems."/" =
    { device = "/dev/disk/by-uuid/c1ccb066-7fa3-408a-a442-d42d2b55ab39";
      fsType = "ext4";
    };

  fileSystems."/boot" =
    { device = "/dev/disk/by-uuid/83D5-5214";
      fsType = "vfat";
    };

  swapDevices = [ ];

  # Enables DHCP on each ethernet and wireless interface. In case of scripted networking
  # (the default) this is the recommended approach. When using systemd-networkd it's
  # still possible to use this option, but it's recommended to use it in conjunction
  # with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
  networking.useDHCP = lib.mkDefault true;
  # networking.interfaces.enp4s0.useDHCP = lib.mkDefault true;
  # networking.interfaces.wlp5s0.useDHCP = lib.mkDefault true;

  nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
  hardware.cpu.amd.updateMicrocode = lib.mkDefault config.hardware.enableRedistributableFirmware;
}

Here’s configuration.nix:

# Edit this configuration file to define what should be installed on
# your system.  Help is available in the configuration.nix(5) man page
# and in the NixOS manual (accessible by running ‘nixos-help’).

{ config, pkgs, ... }:

let
  unstable = import <nixos-unstable> { config = { allowUnfree = true; }; };

in
{
  imports =
    [
      # Include the results of the hardware scan.
      /etc/nixos/hardware-configuration.nix

      <home-manager/nixos>
    ];

  nix.settings.experimental-features = [ "nix-command" "flakes" ];

  # Bootloader.
  boot.loader.systemd-boot.enable = true;
  boot.loader.efi.canTouchEfiVariables = true;

  hardware.bluetooth.enable = true;

  networking.hostName = "cross"; # Define your hostname.

  networking.extraHosts = ''
    # REDACTED
  '';

  #networking.wireless.enable = true;  # Enables wireless support via wpa_supplicant.
  #networking.wireless.networks."Jedi Enclave 5G".psk = "3012197204";

  # Configure network proxy if necessary
  # networking.proxy.default = "http://user:password@proxy:port/";
  # networking.proxy.noProxy = "127.0.0.1,localhost,internal.domain";

  # Enable networking
  networking.networkmanager.enable = true;
  #networking.networkmanager.unmanaged = [ "wlp2s0" ];

  # Firewall
  networking.firewall = {
    allowedTCPPorts = [
      8384 2200 # Syncthing
      4321 # Astro JS dev server port
    ];
    allowedUDPPorts = [ 22000 21027 ]; # Syncthing
  };

  # Set your time zone.
  time.timeZone = "Europe/Oslo";

  # Select internationalisation properties.
  i18n.defaultLocale = "en_US.UTF-8";

  # Configure console keymap
  console.keyMap = "no";

  users.users.tomas = {
    isNormalUser = true;
    description = "Tomas Sandven";
    extraGroups = [ "networkmanager" "wheel" ];
    shell = pkgs.zsh;
    packages = with pkgs; [ ];
  };

  home-manager.users.tomas = { pkgs, ... }: {
    gtk.cursorTheme.package = pkgs.gnome.adwaita-icon-theme;
    gtk.cursorTheme.name = "Adwaita";
    gtk.cursorTheme.size = 24;
    home.stateVersion = "23.05";
    home.pointerCursor.package = pkgs.gnome.adwaita-icon-theme;
    home.pointerCursor.name = "Adwaita";
    home.pointerCursor.size = 24;
  };

  # Allow unfree packages
  nixpkgs.config.allowUnfree = true;
  nixpkgs.config.permittedInsecurePackages = [
    "openssl-1.1.1u" # DELETE ASAP
    "openssl-1.1.1v" # DELETE ASAP
  ];

  security.rtkit.enable = true;

  #
  # System packages
  #

  # Use Neovim nightly
  nixpkgs.overlays = [
    (import (builtins.fetchTarball {
      url = https://github.com/nix-community/neovim-nightly-overlay/archive/master.tar.gz;
    }))
  ];

  environment.systemPackages = with pkgs; [
    bash
    tmux
    bat
    curl
    curlie
    gcc
    fd
    ripgrep
    file
    pciutils # Provides "lspci"
    git
    glib
    killall
    httpie
    inotify-tools
    wget
    jq
    starship
    neovim
    rustup
    zsh-vi-mode
    xdg-utils
    bluez

    # GUI shit
    bottles
    pulseaudio # provides pactl
    gtk3
    gtk4
    gsettings-desktop-schemas
    gnome.dconf-editor
    gnome.adwaita-icon-theme
    gnome.nautilus
    gthumb
    evince
    xorg.xrdb # Read $HOME/.Xresources
    darkman
    hyprpaper
    iwgtk
    kitty
    j4-dmenu-desktop
    insync # Client for Dropbox, Google Drive etc.
    neovide
    pavucontrol
    playerctl
    tofi
    viber
    telegram-desktop
    discord
    slack
    slack-term
    vivaldi
    vivaldi-ffmpeg-codecs
    obsidian
    unstable.waybar
    wl-clipboard
    mako
    slurp # Tool for selecting a screen region on wayland, nice for screenshots
    grim # Grab screenshots from Wayland compositors
    swappy # Super light weight image editor, nice for screenshots
    qt6.qtwayland # Wayland support for Qt
    libsForQt5.qt5.qtwayland # Wayland support for Qt
    xorg.xprop # For the "xprop" command
    google-chrome
  ];

  fonts.fonts = with pkgs; [
    (nerdfonts.override { fonts = [ "CascadiaCode" "FiraCode" "Iosevka" ]; })
  ];

  #
  # Programs
  #

  programs.hyprland = {
    package = unstable.hyprland;
    enable = true;
    xwayland.enable = true;
    xwayland.hidpi = true;
  };

  programs.zsh = {
    enable = true;
    promptInit = ''
      # Starship
      eval "$(starship init zsh)"
    '';

    interactiveShellInit = ''
      # Custom shell stuff
      if [[ -f "$HOME/syncthing/dotfiles/shell-setup.zsh" ]]; then
        source "$HOME/syncthing/dotfiles/shell-setup.zsh"
      fi

      # FZF
      export FZF_DEFAULT_OPTS="--height 50% --reverse --border"
      export FZF_CTRL_R_OPTS="--height 10 --layout default"
      export FZF_DEFAULT_COMMAND='
        find . -type f,l \
          ! -path "*/.*/*" \
          ! -path "*/__pycache__/*" \
          ! -path "*/node_modules/*"'

      # zsh-vi-mode
      source "${pkgs.zsh-vi-mode}/share/zsh-vi-mode/zsh-vi-mode.plugin.zsh"
      zvm_after_init_commands+=('__zvm_apply_keybindings')
      function __zvm_apply_keybindings() {
        source "$(fzf-share)/key-bindings.zsh"
      }
      export ZVM_INSERT_MODE_CURSOR=$ZVM_CURSOR_BLINKING_BEAM
      export ZVM_NORMAL_MODE_CURSOR=$ZVM_CURSOR_BLOCK
      export ZVM_LINE_INIT_MODE=$ZVM_MODE_INSERT
      export ZVM_CURSOR_STYLE_ENABLED=true
    '';
  };

  programs.fzf = {
    keybindings = true;
    fuzzyCompletion = true;
  };

  #
  # Services
  #

  services.dbus.enable = true;

  services.syncthing = {
    enable = true;
    user = "tomas";
    dataDir = "/home/tomas/syncthing";
    configDir = "/home/tomas/.config/syncthing";
    overrideDevices = true;
    devices = {
      # REDACTED
    };
  };

  sound.enable = true;
  services.pipewire = {
    enable = true;
    alsa.enable = true;
    alsa.support32Bit = true;
    pulse.enable = true;
  };

  xdg.portal = {
    enable = true;
    extraPortals = [ pkgs.xdg-desktop-portal-gtk pkgs.xdg-desktop-portal-hyprland ];
  };

  services.gnome.gnome-settings-daemon.enable = true;

  # Configure keymap in X11
  services.xserver = {
    layout = "no";
    xkbVariant = "";
  };

  # Open ports in the firewall.
  # networking.firewall.allowedTCPPorts = [ ... ];
  # networking.firewall.allowedUDPPorts = [ ... ];
  # Or disable the firewall altogether.
  # networking.firewall.enable = false;

  # This value determines the NixOS release from which the default
  # settings for stateful data, like file locations and database versions
  # on your system were taken. It‘s perfectly fine and recommended to leave
  # this value at the release version of the first install of this system.
  # Before changing this value read the documentation for this option
  # (e.g. man configuration.nix or on https://nixos.org/nixos/options.html).
  system.stateVersion = "23.05"; # Did you read the comment?
}

Can you show your kernel log when such a thing happens?

You probably are running into hardware issues or broken disk.

This is the entire kernel log for the last boot (got it with sudo journalctl -k -b -1): https://pastebin.com/raw/EnjUMb67

As I mentioned in the original post, there’s no sign of issues in the kernel log as far as I can see. There are no new entries after I first booted the machine. The PC was left powered on overnight and when I sat back down the next day, the / mount was suddenly read-only. This has happened every time I’ve left the PC for several hours or overnight.

Seems unlikely that I can blame my hardware when Arch has been running stably for around 2 years. This only started happening after I switched to NixOS. It seems more likely that I’ve screwed something up or misunderstood how NixOS is supposed to work.

For example, I find it very strange that the same partition appears to be mounted to 2 different locations:

$ mount
...
/dev/mapper/main-nixos on / type ext4 (rw,relatime)
/dev/mapper/main-nixos on /nix/store type ext4 (ro,relatime)
...

However, when I list the contents of these locations they are completely different:

$ ls /
bin  boot  dev  etc  home  lost+found  mnt  nix  proc  root  run  srv  sys  tmp  usr  var

$ ls /nix/store/
001gp43bjqzx60cg345n2slzg7131za8-nix-nss-open-files.patch
0036j7srhz34adpx9ag18743i7q6pvmw-psqlodbc-10.01.0000.tar.gz.drv
004z1pwkshnhl29dpv2ll8h649qjbhsm-source.drv
009bjlx48wwsh2kaicn2r9lflzxl087j-libgit2-1.6.4.drv
009dvj8zvcxpbah1ikmjqn1sgwdljd2q-poppler-data-0.4.12
...

Is this normal? How is it even possible? :confused:

Some distros mount the root FS readonly, but that’s not typical on NixOS (you’d have to do a lot of things to make that work).

Seeing the same device mounted at both / and /nix/store is normal; the second entry is what’s called a “bind mount”. Basically NixOS does mount --bind -o ro /nix/store /nix/store which says to turn /nix/store into a bind mount, which is basically just a window back into the original /nix/store, except now readonly. This is not related to your problem, and a readonly bind mount for /nix/store is normal. The mount table just makes it look like the root partition is mounted at two different places, because the bind mount is a window into a directory on the root partition.


Anyway, as for your specific problem, the root FS suddenly becoming readonly is almost always due to some failure of the FS, like corruption or failing hardware. Certainly the fsck messages in stage-1-init are not promising, though it seems to think it recovered fully. But it’s not surprising that you don’t see indication of hardware failure in journalctl -k -b -1, because that’s the logs for the previous boot, and the FS would have gone readonly before any logs about hardware failure could have been written.

Next time, try and capture the output of dmesg if you can before you power the system off. That way logs indicating hardware failure are actually captured.

3 Likes

I think it might be related to the following issue: NVMe Issues on recent kernels. · Issue #257159 · NixOS/nixpkgs · GitHub

1 Like

Sounds like it indeed. When my system freezes I can’t run dmesg to see the error log, I just get an “Input/Output error”. Instead, a few days ago, I opened sudo dmesg -Hw, hoping that errors would show up on screen as the disk issue happened.

However, as soon as I did that, the issue stopped happening; it has now been several days without a single occurrence. Perhaps it was fixed in a recent update? :thinking: It used to happen every day.

~For me the issue seems to be resolved in linux-6.5.6-rc1; I’m still testing, and the release should come out around Friday~

NVM, still crashing.

I finally caught the error happening with an already open dmesg instance, and I’m getting the exact same error messages as you, which is comforting.

I just noticed your last posts on the issue, saying that 6.5.6 fixed the issue. I can’t find anything more recent than 6.5.5 in nixpkgs, how did you update to 6.5.6? :thinking:

Forgot to thank you for this reply, it was very educational! :pray:

1 Like

Never mind, the 6.5.6 kernel just showed up on unstable today, I’ve switched to it now. Crossing my fingers the problem is solved :crossed_fingers:

This may or may not be related, but for me several NVMe disks connected via USB behave similarly: during heavy writes they sometimes fail and get remounted as read-only under a different device path, e.g. /dev/sdc1 while previously being /dev/sdb1.

The remount seems to happen due to some kind of disk failure, though for me it seems to be specifically USB related, as I have several similar NVMe disks plugged into M.2 sockets with no problems whatsoever.

Updating to kernel 6.5.6 did not resolve the issue for me.

However, applying the kernel params that the error message suggested seems to have worked:

boot.kernelParams = [
  "nvme_core.default_ps_max_latency_us=0"
  "pcie_aspm=off"
];

I haven’t had a single system lockup since I applied these params, which was several weeks ago.

I was having this issue with a nvme USB case. The solution? A mini fan pointed towards the case!
It was overheating apparently.

I think I am still encountering this error, and I was not able to solve it by setting the kernel parameters suggested above.

My system info: nix-shell -p nix-info --run "nix-info -m"

 - system: `"x86_64-linux"`
 - host os: `Linux 6.1.82, NixOS, 23.11 (Tapir), 23.11.20240328.219951b`
 - multi-user?: `yes`
 - sandbox: `yes`
 - version: `nix-env (Nix) 2.18.1`
 - nixpkgs: `/etc/nix/path/nixpkgs`

Output of smartmontools: nix-shell -p smartmontools --run "sudo smartctl -a /dev/nvme0"

smartctl 7.4 2023-08-01 r5530 [x86_64-linux-6.1.82] (local build)
Copyright (C) 2002-23, Bruce Allen, Christian Franke, www.smartmontools.org

=== START OF INFORMATION SECTION ===
Model Number:                       Samsung SSD 970 EVO Plus 1TB
Serial Number:                      S4EWNX0R411671J
Firmware Version:                   2B2QEXM7
PCI Vendor/Subsystem ID:            0x144d
IEEE OUI Identifier:                0x002538
Total NVM Capacity:                 1.000.204.886.016 [1,00 TB]
Unallocated NVM Capacity:           0
Controller ID:                      4
NVMe Version:                       1.3
Number of Namespaces:               1
Namespace 1 Size/Capacity:          1.000.204.886.016 [1,00 TB]
Namespace 1 Utilization:            838.665.506.816 [838 GB]
Namespace 1 Formatted LBA Size:     512
Namespace 1 IEEE EUI-64:            002538 5411b07d6b
Local Time is:                      Sun Apr 14 14:34:24 2024 CEST
Firmware Updates (0x16):            3 Slots, no Reset required
Optional Admin Commands (0x0017):   Security Format Frmw_DL Self_Test
Optional NVM Commands (0x005f):     Comp Wr_Unc DS_Mngmt Wr_Zero Sav/Sel_Feat Timestmp
Log Page Attributes (0x03):         S/H_per_NS Cmd_Eff_Lg
Maximum Data Transfer Size:         512 Pages
Warning  Comp. Temp. Threshold:     85 Celsius
Critical Comp. Temp. Threshold:     85 Celsius

Supported Power States
St Op     Max   Active     Idle   RL RT WL WT  Ent_Lat  Ex_Lat
 0 +     7.80W       -        -    0  0  0  0        0       0
 1 +     6.00W       -        -    1  1  1  1        0       0
 2 +     3.40W       -        -    2  2  2  2        0       0
 3 -   0.0700W       -        -    3  3  3  3      210    1200
 4 -   0.0100W       -        -    4  4  4  4     2000    8000

Supported LBA Sizes (NSID 0x1)
Id Fmt  Data  Metadt  Rel_Perf
 0 +     512       0         0

=== START OF SMART DATA SECTION ===
SMART overall-health self-assessment test result: PASSED

SMART/Health Information (NVMe Log 0x02)
Critical Warning:                   0x00
Temperature:                        46 Celsius
Available Spare:                    100%
Available Spare Threshold:          10%
Percentage Used:                    2%
Data Units Read:                    173.927.970 [89,0 TB]
Data Units Written:                 80.724.087 [41,3 TB]
Host Read Commands:                 1.584.946.075
Host Write Commands:                783.296.648
Controller Busy Time:               2.511
Power Cycles:                       2.533
Power On Hours:                     1.406
Unsafe Shutdowns:                   459
Media and Data Integrity Errors:    6
Error Information Log Entries:      5.166
Warning  Comp. Temperature Time:    0
Critical Comp. Temperature Time:    0
Temperature Sensor 1:               46 Celsius
Temperature Sensor 2:               46 Celsius

Error Information (NVMe Log 0x01, 16 of 64 entries)
Num   ErrCount  SQId   CmdId  Status  PELoc          LBA  NSID    VS  Message
  0       5166     0  0x701b  0x4004      -            0     0     -  Invalid Field in Command

Self-test Log (NVMe Log 0x06)
Self-test status: No self-test in progress
No Self-tests Logged

Because this error always occurs when I am trying to rebuild my NixOS system config, I can’t actually upgrade to see if the problem goes away.

Does anyone have any hints on what I could try to resolve the issue?

Also, I am not sure the issue I am having is thermal related, as I tried rebuilding directly after booting and it still happened.