# docker/config/docspell.conf

docspell.server {

  # Display name rendered in the top right corner of the web application.
  app-name = "Docspell"

  # Identifier of this node. When running more than one server, every
  # node must be given a unique id.
  app-id = "rest1"

  # Base URL this application is deployed to. It is used to create
  # absolute URLs and to configure the cookie.
  #
  # If left at its default, the HOST line of the login request (or the
  # value of the `X-Forwarded-For` header) is used instead. With any
  # other value the request is not inspected.
  base-url = "https://docs.pukeko.xyz"

  # Base URL for reaching this server internally. While `base-url` may
  # point to some external address (like mydocs.myserver.com), the
  # `internal-url` must be set such that other nodes can reach this
  # server.
  internal-url = "http://restserver:7880"

  # Logging setup.
  logging = {
    # Output format of log messages. One of: Json, Logfmt, Fancy, Plain.
    format = "Fancy"

    # Minimum level that gets logged. Ordered from lowest to highest:
    # Trace, Debug, Info, Warn, Error.
    minimum-level = "Warn"

    # Per-logger overrides of the log level.
    levels {
      "docspell": "Info"
      "org.flywaydb": "Info"
      "binny": "Info"
      "org.http4s": "Info"
    }
  }

  # Address and port the server binds to.
  bind = {
    address = "0.0.0.0"
    port = 7880
  }

  # Tuning options for the http server.
  server-options = {
    enable-http-2 = false

    # Upper bound on allowed connections.
    max-connections = 1024

    # How long to wait for the first output of the response.
    response-timeout = 45s
  }

  # Hard upper bound for the size of a batch returned when searching
  # for items. Users can set their own limit in the client config, but
  # the server restricts it to the number defined here. An admin might
  # choose a lower number depending on the available resources.
  max-item-page-size = 200

  # Number of characters returned for each item's notes when
  # searching. Notes may be very long; returning them in full with
  # every search result adds quite some data, so a limit can be
  # defined here to keep it low.
  max-note-length = 180

  # Whether the classification form is displayed in the collective
  # settings. If all joex instances have document classification
  # disabled, it makes sense to hide these settings from users.
  show-classification-settings = true

  # Authentication.
  auth = {

    # Secret of this server, used to sign the authenticator tokens.
    # If multiple servers are running, all of them must share the same
    # secret. Base64 or hex strings may be used (prefixed with `b64:`
    # and `hex:`, respectively). If empty, a random secret is
    # generated.
    # Example: b64:YRx77QujCGkHSvll0TVEmtTaw3Z5eXr+nWMsEJowgKg=
    server-secret = ""

    # Validity period of an authentication token. The web application
    # periodically obtains a new one.
    session-valid = "5 minutes"

    remember-me = {
      enabled = true
      # Validity period of the remember-me cookie/token.
      valid = "30 days"
    }

    # One of: fail, convert
    #
    # Accounts can be local or defined at a remote provider and
    # integrated via OIDC. If the same account exists in both sources,
    # docspell fails by default when a user mixes logins (e.g. when
    # registering a user locally and then logging in with the same
    # user via OIDC). With `convert`, docspell treats both as the same
    # account and simply updates it to reflect the new account source.
    on-account-source-conflict = "fail"
  }

  # Settings for "download as zip".
  download-all = {
    # Maximum number of files allowed in a zip.
    max-files = 500

    # Maximum (uncompressed) size of the zip file contents.
    max-size = 1400M
  }

  # Configures OpenID Connect (OIDC) or OAuth2 authentication. Only
  # the "Authorization Code Flow" is supported.
  #
  # Multiple authentication providers can be defined. Each is
  # configured in the array below. The `provider` block gives all
  # details necessary to authenticate against an external OIDC or
  # OAuth provider. This requires at least two URLs for OIDC and three
  # for OAuth2. When using OIDC, the `user-url` is only required if
  # the account data is to be retrieved from the user-info endpoint
  # and not from the JWT token. For the request to the `user-url`, the
  # access token is then used to authenticate at the provider. Thus,
  # it doesn't need to be validated here and therefore no `sign-key`
  # setting is needed. However, if you want to extract the account
  # information from the access token, it must be validated here and
  # therefore the correct signature key and algorithm must be
  # provided. If the `sign-key` is left empty, the `user-url` is used
  # and must be specified. If the `sign-key` is _not_ empty, the
  # response from the authentication provider is validated using this
  # key.
  #
  # If a `logout-url` is provided, it will be used to finally redirect
  # the browser to this url that should logout the user from Docspell
  # at the provider.
  #
  # After successful authentication, docspell needs to create the
  # account. For this a username and collective name is required. The
  # account name is defined by the `user-key` and `collective-key`
  # setting. The `user-key` is used to search the JSON structure, that
  # is obtained from the JWT token or the user-info endpoint, for the
  # login name to use. It traverses the JSON structure recursively,
  # until it finds an object with that key. The first value is used.
  #
  # The `collective-key` can be used in multiple ways and both can
  # work together to retrieve the full account id:
  #
  # - If it starts with `fixed:`, like "fixed:collective", the name
  #   after the `fixed:` prefix is used as collective as is. So all
  #   users are in the same collective.
  #
  # - If it starts with `lookup:`, like "lookup:collective_name", the
  #   value after the prefix is used to search the JSON response for
  #   an object with this key, just like it works with the `user-key`.
  #
  # - If it starts with `account:`, like "account:demo", it works the
  #   same as `lookup:` only that the value is interpreted as the full
  #   account name of form `collective/login`. The `user-key` value is
  #   ignored in this case.
  #
  # If these values cannot be obtained from the response, docspell
  # fails the authentication. It is then assumed that the successfully
  # authenticated user at the OP has not enough permissions to access
  # docspell.
  #
  # Below are examples for OpenID Connect (keycloak) and OAuth2
  # (github).
  openid =
    [ { enabled = false,

        # The name to render on the login link/button.
        display = "Keycloak",

        # This illustrates how to use a custom keycloak setup as the
        # authentication provider. For details, please refer to the
        # keycloak documentation. The settings here assume a certain
        # configuration at keycloak.
        #
        # Keycloak can be configured to return the collective name for
        # each user in the access token. It may also be configured to
        # return it in the user info response. If it is already in the
        # access token, an additional request can be omitted. Set the
        # `sign-key` to an empty string then. Otherwise provide the
        # algo and key from your realm settings. In this example, the
        # realm is called "home".
        provider = {
          provider-id = "keycloak",
          client-id = "docspell",
          client-secret = "example-secret-439e-bf06-911e4cdd56a6",
          scope = "profile", # scope is required for OIDC
          authorize-url = "http://localhost:8080/auth/realms/home/protocol/openid-connect/auth",
          token-url = "http://localhost:8080/auth/realms/home/protocol/openid-connect/token",
          # The user-url is not used when a signature key is set.
          #user-url = "http://localhost:8080/auth/realms/home/protocol/openid-connect/userinfo",
          logout-url = "http://localhost:8080/auth/realms/home/protocol/openid-connect/logout",
          sign-key = "b64:anVzdC1hLXRlc3Q=",
          sig-algo = "RS512"
        },
        # The collective of the user is given in the access token as
        # property `docspell_collective`.
        collective-key = "lookup:docspell_collective",
        # The username to use for the docspell account
        user-key = "preferred_username"
      },
      { enabled = true,

        # The name to render on the login link/button.
        display = "Authelia",

        # NOTE(review): the client secret below is stored in plain text
        # in this file. If the file is kept under version control,
        # consider rotating the secret and supplying it from outside
        # the repository.
        provider = {
          provider-id = "authelia",
          client-id = "docspell",
          client-secret = "tEf47Me$YsXG8K4%63$%!kbMqbgVnc*bAq2i4SPERay#T!&ajc35m&D%C#uRMiaSv@cRFxwMcqo%SwEq*49G9HufJ&d#^f*&MK9hzU6s&7C2^XmfGC8Up7YeegnH#VhP",
          scope = "openid profile groups email",
          authorize-url = "https://auth.pukeko.xyz/api/oidc/authorize",
          token-url = "https://auth.pukeko.xyz/api/oidc/token",
          user-url = "https://auth.pukeko.xyz/api/oidc/userinfo",
          # Must be left empty: with an empty `sign-key` the account
          # data is retrieved from the `user-url`.
          sign-key = "",
          # Unused with an empty `sign-key`, but must be set to something.
          sig-algo = "RS256"
        },

        # The collective name is not taken from the provider; a fixed
        # one is used instead. This means all users from this provider
        # are in the same collective!
        collective-key = "fixed:shmick",

        # Key that is searched in the response from the `user-url`;
        # its value is used as the login name of the docspell account.
        user-key = "preferred_username"
      }
    ]

  # When exactly one OIDC/OAuth provider is configured, the webapp
  # automatically redirects to its authentication page, skipping the
  # docspell login page.
  oidc-auto-redirect = true

  # This endpoint allows to upload files to any collective. The
  # intention is that local software integrates with docspell more
  # easily. Therefore the endpoint is not protected by the usual
  # means.
  #
  # For security reasons, this endpoint is disabled by default. If
  # enabled, you can choose from some ways to protect it. It may be a
  # good idea to further protect this endpoint using a firewall, such
  # that outside traffic is not routed.
  #
  # NOTE: If all protection methods are disabled, the endpoint is not
  # protected at all!
  integration-endpoint = {
    enabled = false

    # Priority assigned to files submitted through this endpoint.
    priority = "low"

    # Name used for the item "source" property of uploads made
    # through this endpoint.
    source-name = "integration"

    # IPv4 addresses that are allowed access. If enabled, an empty
    # list prohibits all requests. Addresses may be specified as
    # simple globs: a part marked as `*` matches any octet, like in
    # `192.168.*.*`. `127.0.0.1` (the default) matches the loopback
    # address.
    allowed-ips = {
      enabled = false
      ips = [ "127.0.0.1" ]
    }

    # Uploading requests are expected to use http basic auth.
    http-basic = {
      enabled = false
      realm = "Docspell Integration"
      user = "docspell-int"
      password = "docspell-int"
    }

    # Uploading requests are expected to supply a specific header.
    http-header = {
      enabled = false
      header-name = "Docspell-Integration"
      header-value = "some-secret"
    }
  }

  # This is a special endpoint that allows some basic administration.
  #
  # It is intended to be used by admins only, that is users who
  # installed the app and have access to the system. Normal users
  # should not have access and therefore a secret must be provided in
  # order to access it.
  #
  # This is used for some endpoints, for example:
  # - re-create complete fulltext index:
  #   curl -XPOST -H'Docspell-Admin-Secret: xyz' http://docspell-restserver:7880/api/v1/admin/fts/reIndexAll
  admin-endpoint = {
    # The secret to supply for access. An empty value disables the
    # endpoint.
    secret = ""
  }

  # Full-text search engine configuration. (joex must use the same.)
  full-text-search = {
    # Full-text search can be disabled. It requires an additional
    # index server, which needs additional memory and disk space. It
    # can be enabled later at any time.
    #
    # Currently the SOLR search platform and PostgreSQL are supported.
    enabled = false

    # Backend to use: either solr or postgresql.
    backend = "solr"

    # Settings for the SOLR backend.
    solr {
      # URL to solr.
      url = "http://localhost:8983/solr/docspell"
      # Tells solr when to commit the data.
      commit-within = 1000
      # If true, request and response bodies are logged.
      log-verbose = false
      # The defType parameter to lucene that defines the parser to
      # use. You might want to try "edismax" or look here:
      # https://solr.apache.org/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing
      def-type = "lucene"
      # Default combiner for tokens. One of {AND, OR}.
      q-op = "OR"
    }

    # Settings for the PostgreSQL backend.
    postgresql {
      # Whether to use the default database; only works if it is
      # postgresql.
      use-default-connection = false

      # The database connection.
      jdbc = {
        url = "jdbc:postgresql://server:5432/db"
        user = "pguser"
        password = ""
      }

      # Mapping from a language to a postgres text search config. By
      # default a language is mapped to a predefined config;
      # PostgreSQL has predefined configs for some languages. This
      # setting allows creating a custom text search config and
      # defining it here for some or all languages.
      #
      # Example:
      #  { german = "my-german" }
      #
      # See https://www.postgresql.org/docs/14/textsearch-tables.html ff.
      pg-config {
      }

      # Which query parser to use.
      #
      # https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-PARSING-QUERIES
      pg-query-parser = "websearch_to_tsquery"

      # Normalization applied to the ranking.
      #
      # https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-RANKING
      pg-rank-normalization = [ 4 ]
    }
  }

  # Configuration for the backend.
  backend {

    # Turns debugging of e-mail related functionality on or off. It
    # applies to both sending and receiving mails. For security
    # reasons, logging is not very extensive on authentication
    # failures. Setting this to true results in a lot of data being
    # printed to stdout.
    mail-debug = false

    # Database connection.
    jdbc = {
      # JDBC url to the database. By default a H2 file-based database
      # is configured; a postgresql or mariadb connection can be given
      # here instead. When using H2, use the PostgreSQL compatibility
      # mode and the AUTO_SERVER feature.
      #url = "jdbc:h2://"${java.io.tmpdir}"/docspell-demo.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE"
      url = "jdbc:postgresql://db:5432/dbname"

      # Database user.
      user = "dbuser"

      # Database password.
      password = "dbpass"
    }

    # Settings concerning schema migration.
    database-schema {
      # Run the main database migrations?
      run-main-migrations = true

      # Run the fixup migrations?
      run-fixup-migrations = true

      # Use with care. Repairs all migrations in the database by
      # updating their checksums and removing failed migrations. Good
      # for testing, not recommended for normal operation.
      repair-schema = false
    }

    # Registration of new users.
    signup = {

      # Defines whether new users can sign up. One of three values:
      #
      # - open: every new user can sign up
      # - invite: new users can sign up only if they provide a correct
      #   invitation key. Invitation keys can be generated by the
      #   server.
      # - closed: signing up is disabled.
      #
      # NOTE(review): with `open`, every new user can sign up; confirm
      # this is intended for this deployment.
      mode = "open"

      # Only for mode == 'invite': the password that must be provided
      # to generate invitation keys. It must not be empty.
      new-invite-password = ""

      # Only for mode == 'invite': the period an invitation token is
      # considered valid.
      invite-time = "3 days"
    }

    files = {
      # Chunk size (in bytes) used to store the files. It affects the
      # memory footprint when uploading and downloading files: at most
      # this amount is loaded into RAM for down- and uploading.
      #
      # It also defines the chunk size used for the blobs inside the
      # database.
      chunk-size = 2097152

      # File content types that are considered valid. Docspell only
      # passes these files to processing. The processing code itself
      # also checks which files are supported and which are not. This
      # affects the uploading part and can be used to restrict the
      # file types handed over to processing. By default all files are
      # allowed.
      valid-mime-types = []

      # Id of an enabled store from `stores` that should be used.
      #
      # IMPORTANT NOTE: All nodes must have the exact same file store
      # configuration!
      default-store = "database"

      # The possible file stores. Each entry must have a unique id.
      # The `type` values used below are: default-database,
      # file-system, s3.
      #
      # The enabled property currently serves to define target stores
      # for the "copy files" task. All stores with enabled=false are
      # removed from the list. The `default-store` must be enabled.
      stores {
        database {
          enabled = true
          type = "default-database"
        }

        filesystem {
          enabled = false
          type = "file-system"
          directory = "/some/directory"
        }

        minio {
          enabled = false
          type = "s3"
          endpoint = "http://localhost:9000"
          access-key = "username"
          secret-key = "password"
          bucket = "docspell"
        }
      }
    }

    addons {
      enabled = false

      # Whether installing addons that require network access is
      # allowed.
      allow-impure = true

      # Patterns of urls from which addons may be installed.
      #
      # A pattern is compared against a URL by comparing three parts
      # of the URL via globs: scheme, host and path.
      #
      # You can use '*' (0 or more) and '?' (one) as wildcards in each
      # part. For example:
      #
      #   https://*.mydomain.com/projects/*
      #   *s://gitea.mydomain/*
      #
      # A hostname is separated by dots and the path by a slash. A '*'
      # in a pattern means to match one or more characters. The path
      # pattern always matches the given prefix, so /a/b/* matches
      # /a/b/c and /a/b/c/d and all other sub-paths.
      #
      # NOTE(review): '*' is described above both as "0 or more" and
      # as "one or more" characters; confirm which one applies.
      #
      # Multiple patterns can be defined via a comma separated string
      # or as an array. An empty string matches no URL, while the
      # special pattern '*' all by itself means to match every URL.
      allowed-urls = "*"

      # Same as `allowed-urls`, but a match here means to deny addons
      # from this url.
      denied-urls = ""
    }
  }
}
docspell.joex {

  # Identifier of this node. When running more than one node, every
  # node must be given a unique id.
  app-id = "joex1"


  # Base URL this application is deployed to. It is used to register
  # this joex instance so that docspell rest servers can reach it.
  base-url = "http://docspell-joex:7878"

  # Address and port the REST server binds to.
  #
  # JOEX offers a very simple REST interface for inspecting its state.
  bind = {
    address = "0.0.0.0"
    port = 7878
  }

  # Logging setup.
  logging = {
    # Output format of log messages. One of: Json, Logfmt, Fancy, Plain.
    format = "Fancy"

    # Minimum level that gets logged. Ordered from lowest to highest:
    # Trace, Debug, Info, Warn, Error.
    minimum-level = "Warn"

    # Per-logger overrides of the log level.
    levels {
      "docspell": "Info"
      "org.flywaydb": "Info"
      "binny": "Info"
      "org.http4s": "Info"
    }
  }

  # Database connection.
  #
  # This must be the same connection the rest server is using.
  jdbc = {

    # JDBC url to the database. By default a H2 file-based database is
    # configured; a postgresql or mariadb connection can be given here
    # instead. When using H2, use the PostgreSQL compatibility mode
    # and the AUTO_SERVER feature.
    #url = "jdbc:h2://"${java.io.tmpdir}"/docspell-demo.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE"
    url = "jdbc:postgresql://db:5432/dbname"
    # Database user.
    user = "dbuser"
    # Database password.
    password = "dbpass"
  }

  # Settings concerning schema migration.
  database-schema {
    # Run the main database migrations?
    run-main-migrations = true

    # Run the fixup migrations?
    run-fixup-migrations = true

    # Use with care. Repairs all migrations in the database by
    # updating their checksums and removing failed migrations. Good
    # for testing, not recommended for normal operation.
    repair-schema = false
  }

  # Turns debugging of e-mail related functionality on or off. It
  # applies to both sending and receiving mails. For security reasons,
  # logging is not very extensive on authentication failures. Setting
  # this to true results in a lot of data being printed to stdout.
  mail-debug = false

  send-mail = {
    # Used as the List-Id e-mail header when docspell sends mails to
    # its users (example: notification mails). It is not used when
    # sending to external recipients. If empty, no such header is
    # added. The header is often useful for filtering mails.
    #
    # It should be a string in angle brackets. See
    # https://tools.ietf.org/html/rfc2919 for a formal specification
    # of this header.
    list-id = ""
  }

  # Job scheduler settings.
  scheduler = {

    # Unique name of this scheduler. It is taken from the node id,
    # which must be unique, too.
    name = ${docspell.joex.app-id}

    # How many jobs may be processed in parallel.
    pool-size = 1

    # The counting scheme determines the ratio in which high- and
    # low-prio jobs are run. For example, 4,1 means: run 4 high prio
    # jobs, then 1 low prio job, and then start over.
    counting-scheme = "4,1"

    # How often a failed job is retried before it enters the failed
    # state. A job that fails becomes "stuck" and is retried after a
    # delay.
    retries = 2

    # Delay before the next try of a failed job. The delay increases
    # exponentially with the number of retries.
    retry-delay = "1 minute"

    # Queue size for the log statements of a job.
    log-buffer-size = 500

    # When no job is left in the queue, the scheduler waits until a
    # notify is requested (using the REST interface). To also retry
    # stuck jobs, it notifies itself periodically.
    wakeup-period = "30 minutes"
  }

  periodic-scheduler = {

    # Unique name of this scheduler. It is taken from the node id,
    # which must be unique, too.
    name = ${docspell.joex.app-id}

    # Fallback interval for regularly looking for due periodic tasks.
    # Usually joex instances should be notified via REST calls when
    # external processes change tasks, but these requests may get
    # lost.
    wakeup-period = "10 minutes"
  }

  # Settings for the user-tasks.
  user-tasks = {
    # Importing e-mails by scanning a mailbox.
    scan-mailbox = {
      # Limit on how many folders are scanned. If a user configures
      # more than this, only up to this many folders are scanned and a
      # warning is logged.
      max-folders = 50

      # Number of mails (headers only) retrieved in one chunk.
      #
      # If this is greater than `max-mails`, it is automatically set
      # to the value of `max-mails`.
      mail-chunk-size = 50

      # Limit on how many mails are processed in one job run. It is
      # meant to avoid too heavy resource allocation to one
      # user/collective.
      #
      # If more than this number of mails is encountered, a warning is
      # logged.
      max-mails = 500
    }
  }


  # Docspell uses periodic house keeping tasks, like cleaning expired
  # invites, that can be configured here.
  house-keeping {

    # When the house keeping tasks execute. Default is to run every
    # week.
    schedule = "Sun *-*-* 00:00:00 UTC"

    # This task removes invitation keys that have been created but not
    # used. The timespan here must be greater than the `invite-time`
    # setting in the rest server config file.
    cleanup-invites = {

      # Whether this task is enabled.
      enabled = true

      # The minimum age of invites to be deleted.
      older-than = "30 days"
    }

    # This task removes expired remember-me tokens. The timespan
    # should be greater than the `valid` time in the restserver
    # config.
    cleanup-remember-me = {
      # Whether the job is enabled.
      enabled = true

      # The minimum age of tokens to be deleted.
      older-than = "30 days"
    }

    # Jobs store their log output in the database. Normally this data
    # is only interesting for some period of time. The processing logs
    # of old files can be removed eventually.
    cleanup-jobs = {

      # Whether this task is enabled.
      enabled = true

      # The minimum age of jobs to delete. It is matched against the
      # `finished` timestamp.
      older-than = "30 days"

      # This defines how many jobs are deleted in one transaction.
      # Since the data to delete may get large, it can be configured
      # whether more or less memory should be used. Written as a
      # number, since it is a count and not text.
      delete-batch = 100
    }

    # Zip files created for downloading multiple files are cached and
    # can be cleared periodically.
    cleanup-downloads = {

      # Whether to enable clearing old download archives.
      enabled = true

      # The minimum age of a download file to be deleted.
      older-than = "14 days"
    }

    # Removes node entries that are not reachable anymore.
    check-nodes {
      # Whether this task is enabled
      enabled = true
      # How often the node must be unreachable, before it is removed.
      min-not-found = 2
    }

    # Checks all files against their checksum
    integrity-check {
      enabled = true
    }
  }

  # A periodic task to check for new releases of docspell. It can
  # inform about a new release via e-mail. You need to specify an
  # account that has SMTP settings to use for sending.
  update-check {
    # Whether to enable this task
    enabled = false

    # Sends the mail without checking the latest release. Can be used
    # if you want to see if mail sending works, but don't want to wait
    # until a new release is published.
    test-run = false

    # When the update check should execute. Default is to run every
    # week. You can specify a time zone identifier, like
    # 'Europe/Berlin' at the end.
    schedule = "Sun *-*-* 00:00:00 UTC"

    # An account id in form of `collective/user` (or just `user` if
    # collective and user name are the same). This user account must
    # have at least one valid SMTP setting, which is used to send the
    # mail.
    sender-account = ""

    # The SMTP connection id that should be used for sending the mail.
    smtp-id = ""

    # A list of recipient e-mail addresses.
    # Example: `[ "john.doe@gmail.com" ]`
    recipients = []

    # The subject of the mail. It supports the same variables as the
    # body.
    subject = "Docspell {{ latestVersion }} is available"

    # The body of the mail. Subject and body can contain these
    # variables which are replaced:
    #
    # - `latestVersion` the latest available version of Docspell
    # - `currentVersion` the currently running (old) version of Docspell
    # - `releasedAt` a date when the release was published
    #
    # The body is processed as markdown after the variables have been
    # replaced.
    body = """
Hello,

You are currently running Docspell {{ currentVersion }}. Version *{{ latestVersion }}*
is now available, which was released on {{ releasedAt }}. Check the release page at:

<https://github.com/eikek/docspell/releases/latest>

Have a nice day!

Docspell Update Check
"""
  }

  # Text extraction settings.
  extraction = {
    # For PDF files, reading the text parts of the PDF is tried first.
    # But PDFs can be complex documents that contain both text and
    # images. If the returned text is shorter than the value below,
    # OCR is run afterwards. Both extracted texts are then compared
    # and the longer one is used.
    #
    # With 0 (or a negative value), the text parts of a PDF are
    # ignored; OCR is always run and its result used.
    pdf.min-text-len = 500

    # The dpi used when rendering a pdf page, which results in scaling
    # the image. A standard A4 page rendered at 96dpi results in
    # roughly a 790x1100px image. Using 32 results in roughly a
    # 200x300px image.
    #
    # Note: after changing this, you might want to re-generate preview
    # images. Check the api for this; there is an endpoint to
    # regenerate all for a collective.
    preview.dpi = 32

    # Text extraction via OCR works for image and pdf files. First
    # ghostscript is run to create a gray image from a pdf. Then
    # unpaper is run to optimize the image for the upcoming ocr, which
    # is done by tesseract. All these programs must be available in
    # your PATH, or their absolute path can be specified below.
    ocr = {

      # Images greater than this size are skipped. Note that every
      # image is loaded completely into memory for doing OCR. This is
      # the pixel count, `height * width`, of the image.
      max-image-size = 28000000

      # Defines which pages to process. If a PDF with 600 pages is
      # submitted, it is probably not necessary to scan through all of
      # them; that would take a long time and occupy resources for no
      # value. The first few pages should suffice. The default is the
      # first 10 pages.
      #
      # To have all pages processed, set this number to -1.
      #
      # Note: if you change the ghostscript command below, be aware
      # that this setting (if not -1) adds another parameter to the
      # beginning of the command.
      page-range.begin = 10

      # The ghostscript command.
      ghostscript = {
        command = {
          program = "gs"
          args = [
            "-dNOPAUSE",
            "-dBATCH",
            "-dSAFER",
            "-sDEVICE=tiffscaled8",
            "-sOutputFile={{outfile}}",
            "{{infile}}"
          ]
          timeout = "5 minutes"
        }
        working-dir = ${java.io.tmpdir}"/docspell-extraction"
      }

      # The unpaper command.
      unpaper = {
        command = {
          program = "unpaper"
          args = [
            "{{infile}}",
            "{{outfile}}"
          ]
          timeout = "5 minutes"
        }
      }

      # The tesseract command.
      tesseract = {
        command = {
          program = "tesseract"
          args = [
            "{{file}}",
            "stdout",
            "-l",
            "{{lang}}"
          ]
          timeout = "5 minutes"
        }
      }
    }
  }

  # Settings for text analysis
  text-analysis {
    # Upper limit for the length of text that is analysed.
    #
    # The whole text to analyse must fit into RAM, so a large document
    # may take too much heap. The most important information is
    # usually at the beginning of a document, so in most cases the
    # first two pages should suffice. The default is 5000, which is
    # about 2 pages (just a rough guess, of course); the original
    # author notes that more than 80% of their documents are shorter
    # than 5000 characters.
    #
    # This value applies to nlp and the classifier. A value <= 0
    # disables the limit.
    max-length = 0

    # Directory where the analyser keeps its temporary/working files.
    working-dir = ${java.io.tmpdir}"/docspell-analysis"

    nlp {
      # Selects how the NLP models are set up:
      #
      # 1. full - builds the complete pipeline
      # 2. basic - builds only the ner annotator
      # 3. regexonly - matches each entry in your address book via regexps
      # 4. disabled - doesn't use any stanford-nlp feature
      #
      # full and basic rely on pre-built language models, which are
      # available for only a few languages. Memory usage varies among
      # the languages, so joex should run with at least -Xmx1400M when
      # using mode=full.
      #
      # basic does a quite good job for German and English. It might
      # be worse for French, always depending on the type of text that
      # is analysed. Joex should run with about 500M heap; here again
      # the language German uses the most.
      #
      # regexonly doesn't depend on a language. Roughly, it converts
      # all entries in your address book into regexps and matches each
      # one against the text. This can get memory intensive, too, when
      # the address book grows large. It is included in full and basic
      # by default, but can be used on its own by setting
      # mode=regexonly.
      #
      # With mode=disabled the whole nlp pipeline is switched off and
      # you won't get any suggestions, only what the classifier
      # returns (if enabled).
      mode = "full"

      # The StanfordCoreNLP library caches language models, which
      # requires quite some amount of memory. With a positive duration
      # here, the cache is cleared after that amount of idle time. Set
      # it to 0 to disable clearing if you have enough memory;
      # processing will be faster.
      #
      # Only has an effect if mode != disabled.
      clear-interval = "15 minutes"

      # Restricts proposals for due dates: only dates earlier than
      # this number of years in the future are considered.
      max-due-date-years = 10

      regex-ner = {
        # Controls custom NER annotation, which uses the address book
        # of a collective as input for NER tagging (to automatically
        # find correspondent and concerned entities). With a large
        # address book this can be quite memory intensive and also
        # makes text analysis much slower. But it improves accuracy
        # and can be used independent of the language. Setting this to
        # 0 effectively disables it; NER tagging then uses only
        # statistical models (which also work quite well, but are
        # restricted to the languages mentioned above).
        #
        # Note, this is only relevant if `nlp.mode` is not "disabled".
        max-entries = 1000

        # The NER annotation uses a file of patterns that is derived
        # from a collective's address book. This is how long that data
        # is kept until a check for a state change is done.
        file-cache-time = "1 minute"
      }
    }

    # Settings for doing document classification.
    #
    # Classification works by learning from existing documents. It
    # requires a statistical model that is computed from all existing
    # documents. This process is run periodically as configured by the
    # collective. It may require more memory, depending on the amount
    # of data.
    #
    # It utilises this NLP library: https://nlp.stanford.edu/.
    classification = {
      # Global switch for classification. Each collective can
      # enable/disable auto-tagging. If enabled here, the classifier
      # is also used for finding correspondents and concerned
      # entities.
      enabled = true

      # If concerned with memory consumption, this restricts the
      # number of items to consider. More are better for training. A
      # negative value or zero means to train on all items.
      #
      # Together with `text-analysis.max-length`, this limit defines
      # how much memory is required. On weaker hardware, it is advised
      # to play with these values.
      item-count = 600

      # Settings used to configure the classifier. If multiple
      # entries are given, they are all tried and the "best" is chosen
      # at the end. See
      # https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
      # for more info about these settings. According to the original
      # author, the settings here yielded good results with their
      # dataset.
      #
      # Enclose regexps in triple quotes.
      classifiers = [
        {
          "useSplitWords" = "true"
          "splitWordsTokenizerRegexp" = """[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|."""
          "splitWordsIgnoreRegexp" = """\s+"""
          "useSplitPrefixSuffixNGrams" = "true"
          "maxNGramLeng" = "4"
          "minNGramLeng" = "1"
          "splitWordShape" = "chris4"
          # makes it slower but saves memory
          "intern" = "true"
        }
      ]
    }
  }

  # Configuration for converting files into PDFs.
  #
  # Most of it is delegated to external tools, which can be configured
  # below. They must be available on the PATH, or their full path must
  # be specified below via the `program` key.
  convert {
    # Chunk size used when storing files. This should be the same as
    # used with the rest server.
    #
    # NOTE(review): a commented-out reference to
    # ${docspell.joex.files.chunk-size} was left next to this fixed
    # number, and the number differs from `files.chunk-size` in this
    # file (524288) - confirm that the override is intended.
    chunk-size = 2097152

    # A string used to change the filename of the converted pdf file.
    # If empty, the original file name is used for the pdf file (the
    # extension is always replaced with `pdf`).
    converted-filename-part = "converted"

    # Maximum size when reading images. Larger images are not
    # processed.
    max-image-size = ${docspell.joex.extraction.ocr.max-image-size}

    # Settings for processing markdown files (and other text files)
    # to HTML.
    #
    # To support text formats, text files are first converted to HTML
    # using a markdown processor. The resulting HTML is then converted
    # to a PDF file.
    markdown {
      # CSS used to style the resulting HTML.
      internal-css = """
        body { padding: 2em 5em; }
      """
    }

    # Which HTML->PDF converter command to use. One of: wkhtmlpdf,
    # weasyprint.
    html-converter = "wkhtmlpdf"

    # The external tool wkhtmltopdf is used to convert HTML files
    # into PDF files.
    wkhtmlpdf {
      working-dir = ${java.io.tmpdir}"/docspell-wkhtmltopdf"
      command {
        program = "wkhtmltopdf"
        args = [
          "-s", "A4",
          "--encoding", "{{encoding}}",
          "--load-error-handling", "ignore",
          "--load-media-error-handling", "ignore",
          "-",
          "{{outfile}}"
        ]
        timeout = "10 minutes"
      }
    }

    # weasyprint is an alternative to wkhtmltopdf.
    weasyprint {
      working-dir = ${java.io.tmpdir}"/docspell-weasyprint"
      command {
        program = "weasyprint"
        args = [
          "--optimize-size",
          "all",
          "--encoding",
          "{{encoding}}",
          "-",
          "{{outfile}}"
        ]
        timeout = "10 minutes"
      }
    }

    # tesseract is used to convert image files to PDF files. This
    # also extracts the text in one go.
    tesseract {
      working-dir = ${java.io.tmpdir}"/docspell-convert"
      command {
        program = "tesseract"
        args = [
          "{{infile}}", "out",
          "-l", "{{lang}}",
          "pdf", "txt"
        ]
        timeout = "10 minutes"
      }
    }

    # The external tool unoconv is used to convert "office" files to
    # PDF files. Unoconv uses libreoffice/openoffice for converting,
    # so it supports all formats that libreoffice/openoffice can
    # read.
    #
    # Note: to greatly improve performance, it is recommended to start
    # a libreoffice listener by running `unoconv -l` in a separate
    # process.
    unoconv {
      working-dir = ${java.io.tmpdir}"/docspell-convert"
      command {
        program = "unoconv"
        args = [
          "-f", "pdf",
          "-o", "{{outfile}}",
          "{{infile}}"
        ]
        timeout = "10 minutes"
      }
    }

    # The tool ocrmypdf can be used to convert pdf files to pdf files
    # in order to add extracted text as a separate layer. This makes
    # image-only pdfs searchable and lets you select and copy/paste
    # the text. It also converts pdfs into pdf/a type pdfs, which are
    # best suited for archiving, so it makes sense to use this even
    # for text-only pdfs.
    #
    # Installing ocrmypdf is recommended, but optional. If it is
    # enabled but fails, the error is not fatal and processing
    # continues using the original pdf for extracting text. You can
    # also disable it to remove the errors from the processing logs.
    #
    # The `--skip-text` option is necessary to not fail on "text" pdfs
    # (where ocr is not necessary). In this case, the pdf will be
    # converted to PDF/A.
    ocrmypdf {
      enabled = true
      working-dir = ${java.io.tmpdir}"/docspell-convert"
      command {
        program = "ocrmypdf"
        args = [
          "-l", "{{lang}}",
          "--skip-text",
          "--deskew",
          "-j", "1",
          "{{infile}}",
          "{{outfile}}"
        ]
        timeout = "10 minutes"
      }
    }

    # Allows trying to decrypt a PDF with encryption or protection.
    # If enabled, a PDF's encryption or protection is removed during
    # conversion.
    #
    # Encrypted PDFs need this in order to be processed, because
    # docspell has to read them. It also requires specifying a
    # password here. All passwords are tried when reading a PDF.
    #
    # This is enabled by default with an empty password list, which
    # removes protection from PDFs and is better for processing.
    #
    # Passwords can be given here, and each collective can maintain
    # its own passwords as well. But if the `enabled` setting below is
    # `false`, no attempt at decrypting is made.
    decrypt-pdf {
      enabled = true
      passwords = []
    }
  }

  # The same section is also present in the rest-server config. It is
  # used when submitting files into the job queue for processing.
  #
  # Currently, these settings may affect memory usage of all nodes, so
  # they should be the same on all nodes.
  files {
    # Chunk size (in bytes) used to store the files. It affects the
    # memory footprint when uploading and downloading files: at most
    # this amount is loaded into RAM for down- and uploading.
    #
    # It is also the chunk size used for the blobs inside the
    # database.
    chunk-size = 524288

    # File content types that are considered valid. Docspell only
    # passes these files to processing. The processing code itself
    # also checks which files are supported and which are not. This
    # affects the uploading part and can be used to restrict the file
    # types that are handed over to processing. By default all files
    # are allowed.
    valid-mime-types = []

    # Id of an enabled store from the `stores` setting that should be
    # used.
    #
    # IMPORTANT NOTE: All nodes must have the exact same file store
    # configuration!
    default-store = "database"

    # The possible file stores. Each entry must have a unique id. The
    # `type` is one of: default-database, file-system, s3.
    #
    # The enabled property currently serves to define target stores
    # for the "copy files" task. All stores with enabled=false are
    # removed from the list. The `default-store` must be enabled.
    stores {
      database {
        enabled = true
        type = "default-database"
      }

      filesystem {
        enabled = false
        type = "file-system"
        directory = "/some/directory"
      }

      minio {
        enabled = false
        type = "s3"
        endpoint = "http://localhost:9000"
        access-key = "username"
        secret-key = "password"
        bucket = "docspell"
      }
    }
  }

  # Configuration of the full-text search engine. (the same must be used for restserver)
  full-text-search {
    # Switch for the full-text search feature. It requires an
    # additional index server, which needs additional memory and disk
    # space. It can be enabled later at any time.
    #
    # Currently the SOLR search platform and PostgreSQL are supported.
    enabled = false

    # The backend to use, either solr or postgresql
    backend = "solr"

    # Settings for the SOLR backend.
    solr {
      # URL to solr
      url = "http://localhost:8983/solr/docspell"

      # Tells solr when to commit the data
      commit-within = 1000

      # When true, request and response bodies are logged
      log-verbose = false

      # The defType parameter to lucene that defines the parser to
      # use. You might want to try "edismax" or look here:
      # https://solr.apache.org/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing
      def-type = "lucene"

      # Default combiner for tokens. One of {AND, OR}.
      q-op = "OR"
    }

    # Settings for the PostgreSQL backend
    postgresql {
      # Whether to use the default database; this only works if it is
      # postgresql
      use-default-connection = false

      # Database connection.
      jdbc = {
        url = "jdbc:postgresql://db:5432/dbname"
        user = "dbuser"
        password = "dbpass"
      }

      # Maps a language to a postgres text search config. By default a
      # language is mapped to a predefined config; PostgreSQL has
      # predefined configs for some languages. This setting allows
      # creating a custom text search config and defining it here for
      # some or all languages.
      #
      # Example:
      #  { german = "my-german" }
      #
      # See https://www.postgresql.org/docs/14/textsearch-tables.html ff.
      pg-config = {}

      # Selects the query parser to use.
      #
      # https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-PARSING-QUERIES
      pg-query-parser = "websearch_to_tsquery"

      # Allows defining a normalization for the ranking.
      #
      # https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-RANKING
      pg-rank-normalization = [4]
    }

    # Settings for running the index migration tasks
    migration {
      # Chunk size used when indexing data from the database. This
      # many attachments are loaded into memory and pushed to the
      # full-text index.
      index-all-chunk = 10
    }
  }

  addons {
    # Directory into which addons are extracted when running them.
    # Everything in here is cleared after each run.
    working-dir = ${java.io.tmpdir}"/docspell-addons"

    # Directory where addons can store data between runs. Docspell
    # does not clear it, so it can get large depending on the addons
    # executed.
    #
    # This directory is used as a base; subdirectories are created in
    # it per run configuration id.
    cache-dir = ${java.io.tmpdir}"/docspell-addon-cache"

    executor-config = {
      # A (comma or whitespace separated) list of runners that are
      # responsible for executing an addon. This setting is compared
      # to what is supported by addons. Possible values are:
      #
      # - nix-flake: use nix-flake runner if the addon supports it
      #   (this requires the nix package manager on the joex machine)
      # - docker: use docker
      # - trivial: use the trivial runner
      #
      # The first successful execution is used. This should list all
      # runners the computer supports.
      runner = "nix-flake, docker, trivial"

      # systemd-nspawn can be used to run the program in a container.
      # The runners nix-flake and trivial make use of this.
      nspawn {
        # When false, systemd-nspawn is not tried. When true, the
        # addon is executed inside a lightweight container via
        # systemd-nspawn.
        enabled = false

        # Path to the sudo command. By default systemd-nspawn is
        # executed via sudo - the user running joex must be allowed to
        # do so NON INTERACTIVELY. If this is empty, nspawn is run
        # without sudo.
        sudo-binary = "sudo"

        # Path to the systemd-nspawn command.
        nspawn-binary = "systemd-nspawn"

        # Workaround for multiple same-named containers being run too
        # fast
        container-wait = "100 millis"
      }

      # When multiple addons are executed sequentially, stop after the
      # first failing result. If this is false, subsequent addons are
      # run for their side effects only.
      fail-fast = true

      # Timeout for running an addon.
      run-timeout = "15 minutes"

      # Settings for the nix flake runner.
      nix-runner = {
        # Path to the nix command.
        nix-binary = "nix"

        # Timeout for building the package (running nix build).
        build-timeout = "15 minutes"
      }

      # Settings for the docker runner
      docker-runner = {
        # Path to the docker command.
        docker-binary = "docker"

        # Timeout for building the package (running docker build).
        build-timeout = "15 minutes"
      }
    }
  }
}