NixOS: Cannot run program "/bin/ls"

Hello,

I am trying to run PySpark with spark-nlp.

During a “post installation” step there is an error saying that /bin/ls does not exist.

  • What is the best way to solve this issue on NixOS?
    – Should I just create a symlink?
Py4JJavaError: An error occurred while calling o87.partitions.
: java.lang.RuntimeException: Error while running command to get file permissions : java.io.IOException: Cannot run program "/bin/ls": error=2, No such file or directory
	at java.lang.ProcessBuilder.start(ProcessBuilder.java:1048)

There are two solutions:

  1. Patching, i.e. replacing /bin/ls with “${coreutils}/bin/ls”. This requires building the offending software as a Nix package so the substitution can happen at build time.
  2. Providing the path with buildFHSUserEnv, e.g. https://gist.github.com/Mic92/b59054188c595e5652cacf50485583e0, which creates a container-like environment where /bin/ls exists just for the nix-shell session (a minimal sketch follows below).
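To illustrate option 2, here is a minimal sketch of such a shell.nix (untested here; the package selection is illustrative, see the linked gist for a complete version):

with import <nixpkgs> {};

# buildFHSUserEnv creates a container-like FHS environment, so /bin/ls
# exists only inside this nix-shell session.
(buildFHSUserEnv {
  name = "spark-fhs";
  targetPkgs = pkgs: with pkgs; [
    coreutils   # provides ls, visible as /bin/ls inside the environment
    jdk
    python3
  ];
  runScript = "bash";
}).env

Entering it with nix-shell then gives a session in which Hadoop’s hard-coded /bin/ls call succeeds.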

Of course, if you just want to get it done quickly and it’s only needed during installation, then a symlink from /run/current-system/sw/bin/ls to /bin/ls could also serve as a temporary hack.
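A declarative variant of that hack, as a sketch for a NixOS configuration (assumption: you accept an impure /bin/ls system-wide):

# In configuration.nix: have systemd-tmpfiles (re)create the symlink at boot.
systemd.tmpfiles.rules = [
  # "L+": create a symlink, replacing anything already at that path;
  # the last field is the symlink target.
  "L+ /bin/ls - - - - /run/current-system/sw/bin/ls"
];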

Sorry, I think I have to be more precise:

The Python env is created via mach-nix.


I cannot find the place in the code where a package or library references “/bin/ls”.

# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp

spark = sparknlp.start()
pipeline = PretrainedPipeline(name='explain_document_dl', lang='en')

explain_document_dl download started this may take some time.
Approx size to download 168.4 MB
[OK!]
---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-39-ff889a0cd27d> in <module>
      1 # Download a pre-trained pipeline
----> 2 pipeline = PretrainedPipeline(name= 'explain_document_dl', lang='en' , )
      3 
      4 #pipeline = PretrainedPipeline(name= 'explain_document_dl', lang='en' , disk_location= dwld_models )

/nix/store/i72461v4c9wlgln0mn6zjc0gyjl5zcyi-python3-3.7.8-env/lib/python3.7/site-packages/sparknlp/pretrained.py in __init__(self, name, lang, remote_loc, parse_embeddings, disk_location)
     89     def __init__(self, name, lang='en', remote_loc=None, parse_embeddings=False, disk_location=None):
     90         if not disk_location:
---> 91             self.model = ResourceDownloader().downloadPipeline(name, lang, remote_loc)
     92         else:
     93             self.model = PipelineModel.load(disk_location)

/nix/store/i72461v4c9wlgln0mn6zjc0gyjl5zcyi-python3-3.7.8-env/lib/python3.7/site-packages/sparknlp/pretrained.py in downloadPipeline(name, language, remote_loc)
     58             t1.start()
     59             try:
---> 60                 j_obj = _internal._DownloadPipeline(name, language, remote_loc).apply()
     61                 jmodel = PipelineModel._from_java(j_obj)
     62             finally:

/nix/store/i72461v4c9wlgln0mn6zjc0gyjl5zcyi-python3-3.7.8-env/lib/python3.7/site-packages/sparknlp/internal.py in __init__(self, name, language, remote_loc)
    179 class _DownloadPipeline(ExtendedJavaWrapper):
    180     def __init__(self, name, language, remote_loc):
--> 181         super(_DownloadPipeline, self).__init__("com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.downloadPipeline", name, language, remote_loc)
    182 
    183 

/nix/store/i72461v4c9wlgln0mn6zjc0gyjl5zcyi-python3-3.7.8-env/lib/python3.7/site-packages/sparknlp/internal.py in __init__(self, java_obj, *args)
    127         super(ExtendedJavaWrapper, self).__init__(java_obj)
    128         self.sc = SparkContext._active_spark_context
--> 129         self._java_obj = self.new_java_obj(java_obj, *args)
    130         self.java_obj = self._java_obj
    131 

/nix/store/i72461v4c9wlgln0mn6zjc0gyjl5zcyi-python3-3.7.8-env/lib/python3.7/site-packages/sparknlp/internal.py in new_java_obj(self, java_class, *args)
    137 
    138     def new_java_obj(self, java_class, *args):
--> 139         return self._new_java_obj(java_class, *args)
    140 
    141     def new_java_array(self, pylist, java_class):

/nix/store/i72461v4c9wlgln0mn6zjc0gyjl5zcyi-python3-3.7.8-env/lib/python3.7/site-packages/pyspark/ml/wrapper.py in _new_java_obj(java_class, *args)
     65             java_obj = getattr(java_obj, name)
     66         java_args = [_py2java(sc, arg) for arg in args]
---> 67         return java_obj(*java_args)
     68 
     69     @staticmethod

/nix/store/i72461v4c9wlgln0mn6zjc0gyjl5zcyi-python3-3.7.8-env/lib/python3.7/site-packages/py4j/java_gateway.py in __call__(self, *args)
   1255         answer = self.gateway_client.send_command(command)
   1256         return_value = get_return_value(
-> 1257             answer, self.gateway_client, self.target_id, self.name)
   1258 
   1259         for temp_arg in temp_args:

/nix/store/i72461v4c9wlgln0mn6zjc0gyjl5zcyi-python3-3.7.8-env/lib/python3.7/site-packages/pyspark/sql/utils.py in deco(*a, **kw)
     61     def deco(*a, **kw):
     62         try:
---> 63             return f(*a, **kw)
     64         except py4j.protocol.Py4JJavaError as e:
     65             s = e.java_exception.toString()

/nix/store/i72461v4c9wlgln0mn6zjc0gyjl5zcyi-python3-3.7.8-env/lib/python3.7/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    326                 raise Py4JJavaError(
    327                     "An error occurred while calling {0}{1}{2}.\n".
--> 328                     format(target_id, ".", name), value)
    329             else:
    330                 raise Py4JError(

Py4JJavaError: An error occurred while calling z:com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.downloadPipeline.
: java.lang.RuntimeException: Error while running command to get file permissions : java.io.IOException: Cannot run program "/bin/ls": error=2, No such file or directory
	at java.lang.ProcessBuilder.start(ProcessBuilder.java:1048)
	at org.apache.hadoop.util.Shell.runCommand(Shell.java:523)
	at org.apache.hadoop.util.Shell.run(Shell.java:479)
	at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:773)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:866)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:849)
	at org.apache.hadoop.fs.FileUtil.execCommand(FileUtil.java:1097)
	at org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.loadPermissionInfo(RawLocalFileSystem.java:659)
	at org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.getPermission(RawLocalFileSystem.java:634)
	at org.apache.hadoop.fs.LocatedFileStatus.<init>(LocatedFileStatus.java:49)
	at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1733)
	at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1713)
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:270)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:204)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:273)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:269)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:269)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:273)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:269)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:269)
	at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1388)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.take(RDD.scala:1382)
	at org.apache.spark.rdd.RDD$$anonfun$first$1.apply(RDD.scala:1423)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.first(RDD.scala:1422)
	at org.apache.spark.ml.util.DefaultParamsReader$.loadMetadata(ReadWrite.scala:615)
	at org.apache.spark.ml.Pipeline$SharedReadWrite$.load(Pipeline.scala:267)
	at org.apache.spark.ml.PipelineModel$PipelineModelReader.load(Pipeline.scala:348)
	at org.apache.spark.ml.PipelineModel$PipelineModelReader.load(Pipeline.scala:342)
	at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.downloadPipeline(ResourceDownloader.scala:376)
	at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.downloadPipeline(ResourceDownloader.scala:370)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader$.downloadPipeline(ResourceDownloader.scala:470)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.downloadPipeline(ResourceDownloader.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: error=2, No such file or directory
	at java.lang.UNIXProcess.forkAndExec(Native Method)
	at java.lang.UNIXProcess.<init>(UNIXProcess.java:247)
	at java.lang.ProcessImpl.start(ProcessImpl.java:134)
	at java.lang.ProcessBuilder.start(ProcessBuilder.java:1029)
	... 53 more

	at org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.loadPermissionInfo(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.getPermission(RawLocalFileSystem.java:634)
	at org.apache.hadoop.fs.LocatedFileStatus.<init>(LocatedFileStatus.java:49)
	at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1733)
	at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1713)
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:270)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:204)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:273)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:269)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:269)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:273)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:269)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:269)
	at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1388)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.take(RDD.scala:1382)
	at org.apache.spark.rdd.RDD$$anonfun$first$1.apply(RDD.scala:1423)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.first(RDD.scala:1422)
	at org.apache.spark.ml.util.DefaultParamsReader$.loadMetadata(ReadWrite.scala:615)
	at org.apache.spark.ml.Pipeline$SharedReadWrite$.load(Pipeline.scala:267)
	at org.apache.spark.ml.PipelineModel$PipelineModelReader.load(Pipeline.scala:348)
	at org.apache.spark.ml.PipelineModel$PipelineModelReader.load(Pipeline.scala:342)
	at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.downloadPipeline(ResourceDownloader.scala:376)
	at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.downloadPipeline(ResourceDownloader.scala:370)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader$.downloadPipeline(ResourceDownloader.scala:470)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.downloadPipeline(ResourceDownloader.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)

The creator of the package blames NixOS for the error:

The stack trace kinda suggests the problem is in hadoop (or at least that’s where I would start looking). Do you know exactly which version of hadoop is being used here? The line numbers don’t seem to match with trunk (https://github.com/apache/hadoop/blob/7a6265ac425e7ff1426c015e81f8ac882ff7bcf6/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileUtil.java#L1097) anymore.

My system broke …
(so I cannot reproduce it)


If I now build

let
  mach-nix = import (builtins.fetchGit {
    url = "https://github.com/DavHau/mach-nix/";
    ref = "refs/tags/2.4.0";
  });

  nixPkgs = import mach-nix.nixpkgs.path {
    config = {
      allowUnfree = true;
      permittedInsecurePackages = [ "openssl-1.0.2u" ];
    };
  };
  py = nixPkgs.python37Packages;

  py_env = mach-nix.mkPython rec {
    pkgs = nixPkgs;
    python = nixPkgs.python37;
    requirements = ''
      ipython
      jupyterlab
      pandas

      pyspark
      spark-nlp
    '';

    providers = { };
  };
in
nixPkgs.mkShell rec {
  buildInputs = [
    nixPkgs.bash
    nixPkgs.nix-tree

    nixPkgs.jdk
    nixPkgs.spark

    (py_env.override (args: { ignoreCollisions = true; }))
  ];

  shellHook = ''
    echo ""
    echo ""
    echo "Your env is:"
    echo ${py_env}
    echo ""
    echo ""

    # install data/pretrained pipes from spark-nlp
    ${py_env}/bin/python -c "import pandas"

    jupyter lab
  '';
}

the error is

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-1-998537c07f47> in <module>
      5 
      6 spark = sparknlp.start()
----> 7 pipeline = PretrainedPipeline(name= 'explain_document_dl', lang='en' , )

/nix/store/cz1xwgms5gf69nwc3yc3srqn8zsa0w86-python3-3.7.9-env/lib/python3.7/site-packages/sparknlp/pretrained.py in __init__(self, name, lang, remote_loc, parse_embeddings, disk_location)
     89     def __init__(self, name, lang='en', remote_loc=None, parse_embeddings=False, disk_location=None):
     90         if not disk_location:
---> 91             self.model = ResourceDownloader().downloadPipeline(name, lang, remote_loc)
     92         else:
     93             self.model = PipelineModel.load(disk_location)

/nix/store/cz1xwgms5gf69nwc3yc3srqn8zsa0w86-python3-3.7.9-env/lib/python3.7/site-packages/sparknlp/pretrained.py in downloadPipeline(name, language, remote_loc)
     49     def downloadPipeline(name, language, remote_loc=None):
     50         print(name + " download started this may take some time.")
---> 51         file_size = _internal._GetResourceSize(name, language, remote_loc).apply()
     52         if file_size == "-1":
     53             print("Can not find the model to download please check the name!")

/nix/store/cz1xwgms5gf69nwc3yc3srqn8zsa0w86-python3-3.7.9-env/lib/python3.7/site-packages/sparknlp/internal.py in __init__(self, name, language, remote_loc)
    190     def __init__(self, name, language, remote_loc):
    191         super(_GetResourceSize, self).__init__(
--> 192             "com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.getDownloadSize", name, language, remote_loc)
    193 
    194 

/nix/store/cz1xwgms5gf69nwc3yc3srqn8zsa0w86-python3-3.7.9-env/lib/python3.7/site-packages/sparknlp/internal.py in __init__(self, java_obj, *args)
    127         super(ExtendedJavaWrapper, self).__init__(java_obj)
    128         self.sc = SparkContext._active_spark_context
--> 129         self._java_obj = self.new_java_obj(java_obj, *args)
    130         self.java_obj = self._java_obj
    131 

/nix/store/cz1xwgms5gf69nwc3yc3srqn8zsa0w86-python3-3.7.9-env/lib/python3.7/site-packages/sparknlp/internal.py in new_java_obj(self, java_class, *args)
    137 
    138     def new_java_obj(self, java_class, *args):
--> 139         return self._new_java_obj(java_class, *args)
    140 
    141     def new_java_array(self, pylist, java_class):

/nix/store/cz1xwgms5gf69nwc3yc3srqn8zsa0w86-python3-3.7.9-env/lib/python3.7/site-packages/pyspark/ml/wrapper.py in _new_java_obj(java_class, *args)
     67             java_obj = getattr(java_obj, name)
     68         java_args = [_py2java(sc, arg) for arg in args]
---> 69         return java_obj(*java_args)
     70 
     71     @staticmethod

/nix/store/cz1xwgms5gf69nwc3yc3srqn8zsa0w86-python3-3.7.9-env/lib/python3.7/site-packages/py4j/java_gateway.py in __call__(self, *args)
   1303         answer = self.gateway_client.send_command(command)
   1304         return_value = get_return_value(
-> 1305             answer, self.gateway_client, self.target_id, self.name)
   1306 
   1307         for temp_arg in temp_args:

/nix/store/cz1xwgms5gf69nwc3yc3srqn8zsa0w86-python3-3.7.9-env/lib/python3.7/site-packages/pyspark/sql/utils.py in deco(*a, **kw)
    126     def deco(*a, **kw):
    127         try:
--> 128             return f(*a, **kw)
    129         except py4j.protocol.Py4JJavaError as e:
    130             converted = convert_exception(e.java_exception)

/nix/store/cz1xwgms5gf69nwc3yc3srqn8zsa0w86-python3-3.7.9-env/lib/python3.7/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    326                 raise Py4JJavaError(
    327                     "An error occurred while calling {0}{1}{2}.\n".
--> 328                     format(target_id, ".", name), value)
    329             else:
    330                 raise Py4JError(

Py4JJavaError: An error occurred while calling z:com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.getDownloadSize.
: java.lang.NoClassDefFoundError: org/apache/spark/ml/util/MLReadable$class
	at com.johnsnowlabs.nlp.DocumentAssembler$.<init>(DocumentAssembler.scala:178)
	at com.johnsnowlabs.nlp.DocumentAssembler$.<clinit>(DocumentAssembler.scala)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader$.<init>(ResourceDownloader.scala:431)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader$.<clinit>(ResourceDownloader.scala)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.getDownloadSize(ResourceDownloader.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.apache.spark.ml.util.MLReadable$class
	at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
	... 16 more
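For what it’s worth, the trailing $class in MLReadable$class is how Scala 2.11 encodes trait implementation classes, so this NoClassDefFoundError usually points at a Scala binary-version mismatch: spark-nlp jars built against Spark 2.4 (Scala 2.11) loaded into a Spark 3.x (Scala 2.12) runtime. A hedged sketch of one way to pin matching versions in the mach-nix requirements (the exact pin pyspark==2.4.7 is illustrative and untested):

  py_env = mach-nix.mkPython rec {
    pkgs = nixPkgs;
    python = nixPkgs.python37;
    # Assumption: this spark-nlp release targets Spark 2.4 / Scala 2.11,
    # so pyspark is pinned to a 2.4.x release instead of floating to 3.x.
    requirements = ''
      ipython
      jupyterlab
      pandas

      pyspark==2.4.7
      spark-nlp
    '';
  };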

There is also a warning, but I’m not sure what it means:

20/09/21 12:08:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable