Files
docker/config/docspell.conf

1422 lines
49 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

docspell.server {
# This is shown in the top right corner of the web application
app-name = "Docspell"
# This is the id of this node. If you run more than one server, you
# have to make sure to provide unique ids per node.
app-id = "rest1"
# This is the base URL this application is deployed to. This is used
# to create absolute URLs and to configure the cookie.
#
# If default is not changed, the HOST line of the login request is
# used instead or the value of the `X-Forwarded-For` header. If set
# to some other value, the request is not inspected.
base-url = "https://docs.pukeko.xyz"
# This url is the base url for reaching this server internally.
# While you might set `base-url` to some external address (like
# mydocs.myserver.com), the `internal-url` must be set such that
# other nodes can reach this server.
internal-url = "http://restserver:7880"
# Configures logging
logging {
# Layout of each log message; accepted values are
# Json, Logfmt, Fancy and Plain.
format = "Fancy"
# Global threshold. Levels in ascending order:
# Trace, Debug, Info, Warn, Error
minimum-level = "Warn"
# Per-logger overrides of the level. Logger names are quoted so
# that a dot stays part of the key instead of starting a nested path.
levels {
"docspell" = "Info"
"binny" = "Info"
"org.flywaydb" = "Info"
"org.http4s" = "Info"
}
}
# Where the server binds to.
# The port is the same one that `internal-url` refers to.
bind.address = "0.0.0.0"
bind.port = 7880
# Options for tuning the http server
server-options {
enable-http-2 = false
# Maximum allowed connections
max-connections = 1024
# Timeout for waiting for the first output of the response
response-timeout = 45s
}
# This is a hard limit to restrict the size of a batch that is
# returned when searching for items. The user can set this limit
# within the client config, but it is restricted by the server to
# the number defined here. An admin might choose a lower number
# depending on the available resources.
max-item-page-size = 200
# The number of characters to return for each item notes when
# searching. Item notes may be very long, when returning them with
# all the results from a search, they add quite some data to return.
# In order to keep this low, a limit can be defined here.
max-note-length = 180
# This defines whether the classification form in the collective
# settings is displayed or not. If all joex instances have document
# classification disabled, it makes sense to hide its settings from
# users.
show-classification-settings = true
# Authentication.
auth {
# The secret for this server that is used to sign the authenticator
# tokens. If multiple servers are running, all must share the same
# secret. You can use base64 or hex strings (prefix with b64: and
# hex:, respectively). If empty, a random secret is generated.
# Example: b64:YRx77QujCGkHSvll0TVEmtTaw3Z5eXr+nWMsEJowgKg=
#
# NOTE(review): the value is empty here, so a random secret is
# generated. Presumably tokens signed before a restart (including
# remember-me tokens) stop validating afterwards - confirm, and
# set a fixed secret if that is not wanted.
server-secret = ""
# How long an authentication token is valid. The web application
# will get a new one periodically.
session-valid = "5 minutes"
remember-me {
enabled = true
# How long the remember me cookie/token is valid.
valid = "30 days"
}
# One of: fail, convert
#
# Accounts can be local or defined at a remote provider and
# integrated via OIDC. If the same account is defined in both
# sources, docspell by default fails if a user mixes logins (e.g.
# when registering a user locally and then logging in with the
# same user via OIDC). When set to `convert` docspell treats it as
# being the same and simply updates the account to reflect the new
# account source.
on-account-source-conflict = "fail"
}
# Settings for "download as zip"
download-all {
# How many files to allow in a zip.
max-files = 500
# The maximum (uncompressed) size of the zip file contents.
max-size = 1400M
}
# Configures OpenID Connect (OIDC) or OAuth2 authentication. Only
# the "Authorization Code Flow" is supported.
#
# Multiple authentication providers can be defined. Each is
# configured in the array below. The `provider` block gives all
# details necessary to authenticate against an external OIDC or
# OAuth provider. This requires at least two URLs for OIDC and three
# for OAuth2. When using OIDC, the `user-url` is only required if
# the account data is to be retrieved from the user-info endpoint
# and not from the JWT token. For the request to the `user-url`, the
# access token is then used to authenticate at the provider. Thus,
# it doesn't need to be validated here and therefore no `sign-key`
# setting is needed. However, if you want to extract the account
# information from the access token, it must be validated here and
# therefore the correct signature key and algorithm must be
# provided. If the `sign-key` is left empty, the `user-url` is used
# and must be specified. If the `sign-key` is _not_ empty, the
# response from the authentication provider is validated using this
# key.
#
# If a `logout-url` is provided, it will be used to finally redirect
# the browser to this url that should logout the user from Docspell
# at the provider.
#
# After successful authentication, docspell needs to create the
# account. For this a username and collective name is required. The
# account name is defined by the `user-key` and `collective-key`
# setting. The `user-key` is used to search the JSON structure, that
# is obtained from the JWT token or the user-info endpoint, for the
# login name to use. It traverses the JSON structure recursively,
# until it finds an object with that key. The first value is used.
#
# The `collective-key` can be used in multiple ways and both can
# work together to retrieve the full account id:
#
# - If it starts with `fixed:`, like "fixed:collective", the name
# after the `fixed:` prefix is used as collective as is. So all
# users are in the same collective.
#
# - If it starts with `lookup:`, like "lookup:collective_name", the
# value after the prefix is used to search the JSON response for
# an object with this key, just like it works with the `user-key`.
#
# - If it starts with `account:`, like "account:demo", it works the
# same as `lookup:` only that the value is interpreted as the full
# account name of form `collective/login`. The `user-key` value is
# ignored in this case.
#
# If these values cannot be obtained from the response, docspell
# fails the authentication. It is then assumed that the successfully
# authenticated user at the OP has not enough permissions to access
# docspell.
#
# Below are examples for OpenID Connect (keycloak) and OAuth2
# (github).
openid =
[ { enabled = false,
# The name to render on the login link/button.
display = "Keycloak"
# This illustrates to use a custom keycloak setup as the
# authentication provider. For details, please refer to the
# keycloak documentation. The settings here assume a certain
# configuration at keycloak.
#
# Keycloak can be configured to return the collective name for
# each user in the access token. It may also be configured to
# return it in the user info response. If it is already in the
# access token, an additional request can be omitted. Set the
# `sign-key` to an empty string then. Otherwise provide the
# algo and key from your realm settings. In this example, the
# realm is called "home".
provider = {
provider-id = "keycloak",
client-id = "docspell",
client-secret = "example-secret-439e-bf06-911e4cdd56a6",
scope = "profile", # scope is required for OIDC
authorize-url = "http://localhost:8080/auth/realms/home/protocol/openid-connect/auth",
token-url = "http://localhost:8080/auth/realms/home/protocol/openid-connect/token",
#User URL is not used when signature key is set.
#user-url = "http://localhost:8080/auth/realms/home/protocol/openid-connect/userinfo",
logout-url = "http://localhost:8080/auth/realms/home/protocol/openid-connect/logout"
sign-key = "b64:anVzdC1hLXRlc3Q=",
sig-algo = "RS512"
},
# The collective of the user is given in the access token as
# property `docspell_collective`.
collective-key = "lookup:docspell_collective",
# The username to use for the docspell account
user-key = "preferred_username"
},
# The provider that is actually enabled: an OIDC provider reachable
# at auth.pukeko.xyz, labelled "Authelia" on the login button.
{ enabled = true,
# The name to render on the login link/button.
display = "Authelia"
provider = {
provider-id = "authelia",
client-id = "docspell",
# NOTE(review): this looks like a real client secret stored in
# plain text. If this file is shared or version-controlled,
# rotate the secret at the provider and consider supplying it
# from outside the file (e.g. an environment substitution) -
# confirm what the deployment supports.
client-secret = "tEf47Me$YsXG8K4%63$%!kbMqbgVnc*bAq2i4SPERay#T!&ajc35m&D%C#uRMiaSv@cRFxwMcqo%SwEq*49G9HufJ&d#^f*&MK9hzU6s&7C2^XmfGC8Up7YeegnH#VhP",
scope = "openid profile groups email", # scopes requested from the provider
authorize-url = "https://auth.pukeko.xyz/api/oidc/authorize",
token-url = "https://auth.pukeko.xyz/api/oidc/token",
user-url = "https://auth.pukeko.xyz/api/oidc/userinfo",
sign-key = "" # left empty so that the `user-url` is used for the account data
sig-algo = "RS256" #unused but must be set to something
},
# If the authentication provider doesn't provide the
# collective name, simply use a fixed one. This means all
# users from this provider are in the same collective!
collective-key = "fixed:shmick",
# The response from the `user-url` is searched for the
# `preferred_username` property. Its value is used as the login
# name of the account in docspell.
user-key = "preferred_username"
}
]
# When exactly one OIDC/OAuth provider is configured, then the webapp
# automatically redirects to its authentication page skipping the
# docspell login page.
oidc-auto-redirect = true
# This endpoint allows to upload files to any collective. The
# intention is that local software integrates with docspell more
# easily. Therefore the endpoint is not protected by the usual
# means.
#
# For security reasons, this endpoint is disabled by default. If
# enabled, you can choose from some ways to protect it. It may be a
# good idea to further protect this endpoint using a firewall, such
# that outside traffic is not routed.
#
# NOTE: If all protection methods are disabled, the endpoint is not
# protected at all!
integration-endpoint {
enabled = false
# The priority to use when submitting files through this endpoint.
priority = "low"
# The name used for the item "source" property when uploaded
# through this endpoint.
source-name = "integration"
# IPv4 addresses to allow access. An empty list, if enabled,
# prohibits all requests. IP addresses may be specified as simple
# globs: a part marked as `*' matches any octet, like in
# `192.168.*.*`. The `127.0.0.1' (the default) matches the
# loopback address.
allowed-ips {
enabled = false
ips = [ "127.0.0.1" ]
}
# Requests are expected to use http basic auth when uploading
# files.
http-basic {
enabled = false
realm = "Docspell Integration"
user = "docspell-int"
password = "docspell-int"
}
# Requests are expected to supply some specific header when
# uploading files.
http-header {
enabled = false
header-name = "Docspell-Integration"
header-value = "some-secret"
}
}
# This is a special endpoint that allows some basic administration.
#
# It is intended to be used by admins only, that is users who
# installed the app and have access to the system. Normal users
# should not have access and therefore a secret must be provided in
# order to access it.
#
# This is used for some endpoints, for example:
# - re-create complete fulltext index:
# curl -XPOST -H'Docspell-Admin-Secret: xyz' http://docspell-restserver:7880/api/v1/admin/fts/reIndexAll
admin-endpoint {
# The secret. If empty, the endpoint is disabled.
secret = ""
}
# Configuration of the full-text search engine. (the same must be used for joex)
full-text-search {
# The full-text search feature can be disabled. It requires an
# additional index server which needs additional memory and disk
# space. It can be enabled later any time.
#
# Currently the SOLR search platform and PostgreSQL are supported.
enabled = false
# Which backend to use, either solr or postgresql
backend = "solr"
# Configuration for the SOLR backend.
solr = {
# The URL to solr
url = "http://localhost:8983/solr/docspell"
# Used to tell solr when to commit the data
commit-within = 1000
# If true, logs request and response bodies
log-verbose = false
# The defType parameter to lucene that defines the parser to
# use. You might want to try "edismax" or look here:
# https://solr.apache.org/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing
def-type = "lucene"
# The default combiner for tokens. One of {AND, OR}.
q-op = "OR"
}
# Configuration for PostgreSQL backend
postgresql = {
# Whether to use the default database, only works if it is
# postgresql
use-default-connection = false
# The database connection.
jdbc {
url = "jdbc:postgresql://server:5432/db"
user = "pguser"
password = ""
}
# A mapping from a language to a postgres text search config. By
# default a language is mapped to a predefined config.
# PostgreSQL has predefined configs for some languages. This
# setting allows to create a custom text search config and
# define it here for some or all languages.
#
# Example:
# { german = "my-german" }
#
# See https://www.postgresql.org/docs/14/textsearch-tables.html ff.
pg-config = {
}
# Define which query parser to use.
#
# https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-PARSING-QUERIES
pg-query-parser = "websearch_to_tsquery"
# Allows to define a normalization for the ranking.
#
# https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-RANKING
pg-rank-normalization = [ 4 ]
}
}
# Configuration for the backend.
backend {
# Enable or disable debugging for e-mail related functionality. This
# applies to both sending and receiving mails. For security reasons
# logging is not very extensive on authentication failures. Setting
# this to true, results in a lot of data printed to stdout.
mail-debug = false
# The database connection.
jdbc {
# The JDBC url to the database. By default a H2 file-based
# database is configured. You can provide a postgresql or
# mariadb connection here. When using H2 use the PostgreSQL
# compatibility mode and AUTO_SERVER feature.
#url = "jdbc:h2://"${java.io.tmpdir}"/docspell-demo.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE"
url = "jdbc:postgresql://db:5432/dbname"
# The database user.
user = "dbuser"
# The database password.
password = "dbpass"
}
# Additional settings related to schema migration.
database-schema = {
# Whether to run main database migrations.
run-main-migrations = true
# Whether to run the fixup migrations.
run-fixup-migrations = true
# Use with care. This repairs all migrations in the database by
# updating their checksums and removing failed migrations. Good
# for testing, not recommended for normal operation.
repair-schema = false
}
# Configuration for registering new users.
signup {
# The mode defines if new users can signup or not. It can have
# three values:
#
# - open: every new user can sign up
# - invite: new users can sign up only if they provide a correct
# invitation key. Invitation keys can be generated by the
# server.
# - closed: signing up is disabled.
#
# NOTE(review): `open` is combined with a `base-url` that appears
# to be a public host name, so anyone who can reach the server can
# presumably register an account. Confirm this is intended;
# otherwise use `invite` or `closed`.
mode = "open"
# If mode == 'invite', a password must be provided to generate
# invitation keys. It must not be empty.
new-invite-password = ""
# If mode == 'invite', this is the period an invitation token is
# considered valid.
invite-time = "3 days"
}
files {
# Defines the chunk size (in bytes) used to store the files.
# This will affect the memory footprint when uploading and
# downloading files. At most this amount is loaded into RAM for
# down- and uploading.
#
# It also defines the chunk size used for the blobs inside the
# database.
chunk-size = 2097152
# The file content types that are considered valid. Docspell
# will only pass these files to processing. The processing code
# itself has also checks for which files are supported and which
# not. This affects the uploading part and can be used to
# restrict file types that should be handed over to processing.
# By default all files are allowed.
valid-mime-types = [ ]
# The id of an enabled store from the `stores` array that should
# be used.
#
# IMPORTANT NOTE: All nodes must have the exact same file store
# configuration!
default-store = "database"
# A list of possible file stores. Each entry must have a unique
# id. The `type` is one of: default-database, filesystem, s3.
#
# The enabled property serves currently to define target stores
# for the "copy files" task. All stores with enabled=false are
# removed from the list. The `default-store` must be enabled.
stores = {
database =
{ enabled = true
type = "default-database"
}
filesystem =
{ enabled = false
type = "file-system"
directory = "/some/directory"
}
minio =
{ enabled = false
type = "s3"
endpoint = "http://localhost:9000"
access-key = "username"
secret-key = "password"
bucket = "docspell"
}
}
}
addons = {
enabled = false
# Whether installing addons requiring network should be allowed
# or not.
allow-impure = true
# Define patterns of urls that are allowed to install addons
# from.
#
# A pattern is compared against an URL by comparing three parts
# of an URL via globs: scheme, host and path.
#
# You can use '*' (0 or more) and '?' (one) as wildcards in each
# part. For example:
#
# https://*.mydomain.com/projects/*
# *s://gitea.mydomain/*
#
# A hostname is separated by dots and the path by a slash. A '*'
# in a pattern means to match one or more characters. The path
# pattern is always matching the given prefix. So /a/b/* matches
# /a/b/c and /a/b/c/d and all other sub-paths.
#
# Multiple patterns can be defined via a comma separated string
# or as an array. An empty string matches no URL, while the
# special pattern '*' all by itself means to match every URL.
allowed-urls = "*"
# Same as `allowed-urls` but a match here means to deny addons
# from this url.
denied-urls = ""
}
}
}
docspell.joex {
# This is the id of this node. If you run more than one server, you
# have to make sure to provide unique ids per node.
app-id = "joex1"
# This is the base URL this application is deployed to. This is used
# to register this joex instance such that docspell rest servers can
# reach them
base-url = "http://docspell-joex:7878"
# Where the REST server binds to.
#
# JOEX provides a very simple REST interface to inspect its state.
# The port is the same one that `base-url` refers to.
bind.address = "0.0.0.0"
bind.port = 7878
# Configures logging
# Layout of each log message; accepted values are
# Json, Logfmt, Fancy and Plain.
logging.format = "Fancy"
# Global threshold. Levels in ascending order:
# Trace, Debug, Info, Warn, Error
logging.minimum-level = "Warn"
# Per-logger overrides of the level. Logger names are quoted so
# that a dot stays part of the key instead of starting a nested path.
logging.levels {
"docspell" = "Info"
"binny" = "Info"
"org.flywaydb" = "Info"
"org.http4s" = "Info"
}
# The database connection.
#
# It must be the same connection as the rest server is using.
jdbc {
# The JDBC url to the database. By default a H2 file-based
# database is configured. You can provide a postgresql or mariadb
# connection here. When using H2 use the PostgreSQL compatibility
# mode and AUTO_SERVER feature.
#url = "jdbc:h2://"${java.io.tmpdir}"/docspell-demo.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE"
url = "jdbc:postgresql://db:5432/dbname"
# The database user.
user = "dbuser"
# The database password.
password = "dbpass"
}
# Additional settings related to schema migration.
database-schema = {
# Whether to run main database migrations.
run-main-migrations = true
# Whether to run the fixup migrations.
run-fixup-migrations = true
# Use with care. This repairs all migrations in the database by
# updating their checksums and removing failed migrations. Good
# for testing, not recommended for normal operation.
repair-schema = false
}
# Enable or disable debugging for e-mail related functionality. This
# applies to both sending and receiving mails. For security reasons
# logging is not very extensive on authentication failures. Setting
# this to true, results in a lot of data printed to stdout.
mail-debug = false
send-mail {
# This is used as the List-Id e-mail header when mails are sent
# from docspell to its users (example: for notification mails). It
# is not used when sending to external recipients. If it is empty,
# no such header is added. Using this header is often useful when
# filtering mails.
#
# It should be a string in angle brackets. See
# https://tools.ietf.org/html/rfc2919 for a formal specification
# of this header.
list-id = ""
}
# Configuration for the job scheduler.
scheduler {
# Each scheduler needs a unique name. This defaults to the node
# name, which must be unique, too.
name = ${docspell.joex.app-id}
# Number of processing allowed in parallel.
pool-size = 1
# A counting scheme determines the ratio of how high- and low-prio
# jobs are run. For example: 4,1 means run 4 high prio jobs, then
# 1 low prio and then start over.
counting-scheme = "4,1"
# How often a failed job should be retried until it enters failed
# state. If a job fails, it becomes "stuck" and will be retried
# after a delay.
retries = 2
# The delay until the next try is performed for a failed job. This
# delay is increased exponentially with the number of retries.
retry-delay = "1 minute"
# The queue size of log statements from a job.
log-buffer-size = 500
# If no job is left in the queue, the scheduler will wait until a
# notify is requested (using the REST interface). To also retry
# stuck jobs, it will notify itself periodically.
wakeup-period = "30 minutes"
}
periodic-scheduler {
# Each scheduler needs a unique name. This defaults to the node
# name, which must be unique, too.
name = ${docspell.joex.app-id}
# A fallback to start looking for due periodic tasks regularly.
# Usually joex instances should be notified via REST calls if
# external processes change tasks. But these requests may get
# lost.
wakeup-period = "10 minutes"
}
# Configuration for the user-tasks.
user-tasks {
# Allows to import e-mails by scanning a mailbox.
scan-mailbox {
# A limit of how many folders to scan through. If a user
# configures more than this, only up to this many folders are
# scanned and a warning is logged.
max-folders = 50
# How many mails (headers only) to retrieve in one chunk.
#
# If this is greater than `max-mails' it is set automatically to
# the value of `max-mails'.
mail-chunk-size = 50
# A limit on how many mails to process in one job run. This is
# meant to avoid too heavy resource allocation to one
# user/collective.
#
# If more than this number of mails is encountered, a warning is
# logged.
max-mails = 500
}
}
# Docspell uses periodic house keeping tasks, like cleaning expired
# invites, that can be configured here.
house-keeping {
# When the house keeping tasks execute. Default is to run every
# week.
schedule = "Sun *-*-* 00:00:00 UTC"
# This task removes invitation keys that have been created but not
# used. The timespan here must be greater than the `invite-time'
# setting in the rest server config file.
cleanup-invites = {
# Whether this task is enabled.
enabled = true
# The minimum age of invites to be deleted.
older-than = "30 days"
}
# This task removes expired remember-me tokens. The timespan
# should be greater than the `valid` time in the restserver
# config.
cleanup-remember-me = {
# Whether the job is enabled.
enabled = true
# The minimum age of tokens to be deleted.
older-than = "30 days"
}
# Jobs store their log output in the database. Normally this data
# is only interesting for some period of time. The processing logs
# of old files can be removed eventually.
cleanup-jobs = {
# Whether this task is enabled.
enabled = true
# The minimum age of jobs to delete. It is matched against the
# `finished' timestamp.
older-than = "30 days"
# This defines how many jobs are deleted in one transaction.
# Since the data to delete may get large, it can be configured
# whether more or less memory should be used.
#
# Written as a number, like the other numeric settings in this
# file, rather than as a quoted string.
delete-batch = 100
}
# Zip files created for downloading multiple files are cached and
# can be cleared periodically.
cleanup-downloads = {
# Whether to enable clearing old download archives.
enabled = true
# The minimum age of a download file to be deleted.
older-than = "14 days"
}
# Removes node entries that are not reachable anymore.
check-nodes {
# Whether this task is enabled
enabled = true
# How often the node must be unreachable, before it is removed.
min-not-found = 2
}
# Checks all files against their checksum
integrity-check {
enabled = true
}
}
# A periodic task to check for new releases of docspell. It can
# inform about a new release via e-mail. You need to specify an
# account that has SMTP settings to use for sending.
update-check {
# Whether to enable this task
enabled = false
# Sends the mail without checking the latest release. Can be used
# if you want to see if mail sending works, but don't want to wait
# until a new release is published.
test-run = false
# When the update check should execute. Default is to run every
# week. You can specify a time zone identifier, like
# 'Europe/Berlin' at the end.
schedule = "Sun *-*-* 00:00:00 UTC"
# An account id in form of `collective/user` (or just `user` if
# collective and user name are the same). This user account must
# have at least one valid SMTP settings which are used to send the
# mail.
sender-account = ""
# The SMTP connection id that should be used for sending the mail.
smtp-id = ""
# A list of recipient e-mail addresses.
# Example: `[ "john.doe@gmail.com" ]`
recipients = []
# The subject of the mail. It supports the same variables as the
# body.
subject = "Docspell {{ latestVersion }} is available"
# The body of the mail. Subject and body can contain these
# variables which are replaced:
#
# - `latestVersion` the latest available version of Docspell
# - `currentVersion` the currently running (old) version of Docspell
# - `releasedAt` a date when the release was published
#
# The body is processed as markdown after the variables have been
# replaced.
body = """
Hello,
You are currently running Docspell {{ currentVersion }}. Version *{{ latestVersion }}*
is now available, which was released on {{ releasedAt }}. Check the release page at:
<https://github.com/eikek/docspell/releases/latest>
Have a nice day!
Docspell Update Check
"""
}
# Configuration of text extraction
extraction {
# For PDF files it is first tried to read the text parts of the
# PDF. But PDFs can be complex documents and they may contain text
# and images. If the returned text is shorter than the value
# below, OCR is run afterwards. Then both extracted texts are
# compared and the longer will be used.
#
# If you set this to 0 (or a negative value), then the text parts
# of a PDF are ignored and OCR is always run and its result used.
pdf {
min-text-len = 500
}
preview {
# When rendering a pdf page, use this dpi. This results in
# scaling the image. A standard A4 page rendered at 96dpi
# results in roughly 790x1100px image. Using 32 results in
# roughly 200x300px image.
#
# Note, when this is changed, you might want to re-generate
# preview images. Check the api for this, there is an endpoint
# to regenerate all for a collective.
dpi = 32
}
# Extracting text using OCR works for image and pdf files. It will
# first run ghostscript to create a gray image from a pdf. Then
# unpaper is run to optimize the image for the upcoming ocr, which
# will be done by tesseract. All these programs must be available
# in your PATH or the absolute path can be specified below.
ocr {
# Images greater than this size are skipped. Note that every
# image is loaded completely into memory for doing OCR. This is
# the pixel count, `height * width` of the image.
max-image-size = 28000000
# Defines what pages to process. If a PDF with 600 pages is
# submitted, it is probably not necessary to scan through all of
# them. This would take a long time and occupy resources for no
# value. The first few pages should suffice. The default is first
# 10 pages.
#
# If you want all pages being processed, set this number to -1.
#
# Note: if you change the ghostscript command below, be aware that
# this setting (if not -1) will add another parameter to the
# beginning of the command.
page-range {
begin = 10
}
# The ghostscript command.
ghostscript {
command {
program = "gs"
args = [ "-dNOPAUSE"
, "-dBATCH"
, "-dSAFER"
, "-sDEVICE=tiffscaled8"
, "-sOutputFile={{outfile}}"
, "{{infile}}"
]
timeout = "5 minutes"
}
working-dir = ${java.io.tmpdir}"/docspell-extraction"
}
# The unpaper command.
unpaper {
command {
program = "unpaper"
args = [ "{{infile}}", "{{outfile}}" ]
timeout = "5 minutes"
}
}
# The tesseract command.
tesseract {
command {
program = "tesseract"
args = ["{{file}}"
, "stdout"
, "-l"
, "{{lang}}"
]
timeout = "5 minutes"
}
}
}
}
# Settings for text analysis
text-analysis {
# Maximum length of text to be analysed.
#
# All text to analyse must fit into RAM. A large document may take
# too much heap. Also, most important information is at the
# beginning of a document, so in most cases the first two pages
# should suffice. Default is 5000, which are about 2 pages (just a
# rough guess, of course). For my data, more than 80% of the
# documents are less than 5000 characters.
#
# This values applies to nlp and the classifier. If this value is
# <= 0, the limit is disabled.
max-length = 0
# A working directory for the analyser to store temporary/working
# files.
working-dir = ${java.io.tmpdir}"/docspell-analysis"
nlp {
# The mode for configuring NLP models:
#
# 1. full - builds the complete pipeline
# 2. basic - builds only the ner annotator
# 3. regexonly - matches each entry in your address book via regexps
# 4. disabled - doesn't use any stanford-nlp feature
#
# The full and basic variants rely on pre-build language models
# that are available for only a few languages. Memory usage
# varies among the languages. So joex should run with -Xmx1400M
# at least when using mode=full.
#
# The basic variant does a quite good job for German and
# English. It might be worse for French, always depending on the
# type of text that is analysed. Joex should run with about 500M
# heap, here again language German uses the most.
#
# The regexonly variant doesn't depend on a language. It roughly
# works by converting all entries in your addressbook into
# regexps and matches each one against the text. This can get
# memory intensive, too, when the addressbook grows large. This
# is included in the full and basic by default, but can be used
# independently by setting mode=regexonly.
#
# When mode=disabled, then the whole nlp pipeline is disabled,
# and you won't get any suggestions. Only what the classifier
# returns (if enabled).
mode = full
# The StanfordCoreNLP library caches language models which
# requires quite some amount of memory. Setting this interval to a
# positive duration, the cache is cleared after this amount of
# idle time. Set it to 0 to disable it if you have enough memory,
# processing will be faster.
#
# This has only any effect, if mode != disabled.
clear-interval = "15 minutes"
# Restricts proposals for due dates. Only dates earlier than this
# number of years in the future are considered.
max-due-date-years = 10
regex-ner {
# Whether to enable custom NER annotation. This uses the
# address book of a collective as input for NER tagging (to
# automatically find correspondent and concerned entities). If
# the address book is large, this can be quite memory
# intensive and also makes text analysis much slower. But it
# improves accuracy and can be used independent of the
# language. If this is set to 0, it is effectively disabled
# and NER tagging uses only statistical models (that also work
# quite well, but are restricted to the languages mentioned
# above).
#
# Note, this is only relevant if nlp-config.mode is not
# "disabled".
max-entries = 1000
# The NER annotation uses a file of patterns that is derived
# from a collective's address book. This is the time how
# long this data will be kept until a check for a state change
# is done.
file-cache-time = "1 minute"
}
}
# Settings for doing document classification.
#
# This works by learning from existing documents. This requires a
# statistical model that is computed from all existing documents.
# This process is run periodically as configured by the
# collective. It may require more memory, depending on the amount
# of data.
#
# It utilises this NLP library: https://nlp.stanford.edu/.
classification {
  # Global switch for classification. Each collective can still
  # enable or disable auto-tagging on its own. If enabled here, the
  # classifier is also used for finding correspondents and
  # concerned entities.
  enabled = true

  # Restricts the number of items to consider, for when memory
  # consumption is a concern. More items are better for training.
  # Zero or a negative value means to train on all items.
  #
  # Together with `text-analysis.max-length`, this limit defines
  # how much memory is required. On weaker hardware it is advised
  # to experiment with these values.
  item-count = 600

  # Settings used to configure the classifier. If multiple entries
  # are given, all of them are tried and the "best" one is chosen
  # at the end. See
  # https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
  # for more information about these settings. The settings here
  # yielded good results with the author's own dataset.
  #
  # All values are written as strings. Enclose regexps in triple
  # quotes.
  classifiers = [
    {
      "useSplitWords" = "true"
      "splitWordsTokenizerRegexp" = """[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|."""
      "splitWordsIgnoreRegexp" = """\s+"""
      "useSplitPrefixSuffixNGrams" = "true"
      "maxNGramLeng" = "4"
      "minNGramLeng" = "1"
      "splitWordShape" = "chris4"
      "intern" = "true" # makes it slower but saves memory
    }
  ]
}
}
# Configuration for converting files into PDFs.
#
# Most of it is delegated to external tools, which can be configured
# below. The tools must be on the PATH, or their full path must be
# given below via the `program` key.
convert {
# The chunk size used when storing files. This should be the same
# as used with the rest server.
chunk-size = 2097152
# Previously: ${docspell.joex.files.chunk-size}
# NOTE(review): the literal above differs from the `files.chunk-size`
# value set further down in this file (524288). Confirm that this
# override is intended and matches the rest server.
# A string used to change the filename of the converted pdf file.
# If empty, the original file name is used for the pdf file (the
# extension is always replaced with `pdf`).
converted-filename-part = "converted"
# When reading images, this is the maximum size. Images that are
# larger are not processed. The value is taken from the OCR
# extraction settings via substitution.
max-image-size = ${docspell.joex.extraction.ocr.max-image-size}
# Settings when processing markdown files (and other text files)
# to HTML.
#
# In order to support text formats, text files are first converted
# to HTML using a markdown processor. The resulting HTML is then
# converted to a PDF file.
markdown {
# The CSS that is used to style the resulting HTML.
internal-css = """
body { padding: 2em 5em; }
"""
}
# Which HTML->PDF converter command to use. One of: wkhtmlpdf,
# weasyprint. The value names one of the two sections below.
html-converter = "wkhtmlpdf"
# To convert HTML files into PDF files, the external tool
# wkhtmltopdf is used.
#
# NOTE(review): `{{encoding}}` and `{{outfile}}` look like
# placeholders that are filled in before the command is run;
# the same applies to `{{infile}}` and `{{lang}}` in the sections
# below. Confirm against the Docspell documentation.
wkhtmlpdf {
command = {
program = "wkhtmltopdf"
args = [
"-s",
"A4",
"--encoding",
"{{encoding}}",
"--load-error-handling", "ignore",
"--load-media-error-handling", "ignore",
"-",
"{{outfile}}"
]
timeout = "10 minutes"
}
working-dir = ${java.io.tmpdir}"/docspell-wkhtmltopdf"
}
# An alternative to wkhtmltopdf is weasyprint.
weasyprint {
command = {
program = "weasyprint"
args = [
"--optimize-size", "all",
"--encoding", "{{encoding}}",
"-",
"{{outfile}}"
]
timeout = "10 minutes"
}
working-dir = ${java.io.tmpdir}"/docspell-weasyprint"
}
# To convert image files to PDF files, tesseract is used. This
# also extracts the text in one go.
tesseract = {
command = {
program = "tesseract"
args = [
"{{infile}}",
"out",
"-l",
"{{lang}}",
"pdf",
"txt"
]
timeout = "10 minutes"
}
working-dir = ${java.io.tmpdir}"/docspell-convert"
}
# To convert "office" files to PDF files, the external tool
# unoconv is used. Unoconv uses libreoffice/openoffice for
# converting. So it supports all formats that are possible to read
# with libreoffice/openoffice.
#
# Note: to greatly improve performance, it is recommended to start
# a libreoffice listener by running `unoconv -l` in a separate
# process.
unoconv = {
command = {
program = "unoconv"
args = [
"-f",
"pdf",
"-o",
"{{outfile}}",
"{{infile}}"
]
timeout = "10 minutes"
}
working-dir = ${java.io.tmpdir}"/docspell-convert"
}
# The tool ocrmypdf can be used to convert pdf files to pdf files
# in order to add extracted text as a separate layer. This makes
# image-only pdfs searchable and you can select and copy/paste the
# text. It also converts pdfs into pdf/a type pdfs, which are best
# suited for archiving. So it makes sense to use this even for
# text-only pdfs.
#
# It is recommended to install ocrmypdf, but it is optional.
# If it is enabled but fails, the error is not fatal and the
# processing will continue using the original pdf for extracting
# text. You can also disable it to remove the errors from the
# processing logs.
#
# The `--skip-text` option is necessary to not fail on "text" pdfs
# (where ocr is not necessary). In this case, the pdf will be
# converted to PDF/A.
ocrmypdf = {
enabled = true
command = {
program = "ocrmypdf"
args = [
"-l", "{{lang}}",
"--skip-text",
"--deskew",
"-j", "1",
"{{infile}}",
"{{outfile}}"
]
timeout = "10 minutes"
}
working-dir = ${java.io.tmpdir}"/docspell-convert"
}
# Allows attempting to decrypt a PDF with encryption or protection.
# If enabled, a PDF's encryption or protection will be removed
# during conversion.
#
# For encrypted PDFs, this is necessary to be processed, because
# docspell needs to read it. It also requires to specify a
# password here. All passwords are tried when reading a PDF.
#
# This is enabled by default with an empty password list. This
# removes protection from PDFs, which is better for processing.
#
# Passwords can be given here and each collective can maintain
# their passwords as well. But if the `enabled` setting below is
# `false`, then no attempt at decrypting is done.
decrypt-pdf = {
enabled = true
passwords = []
}
}
# The same section is also present in the rest-server config. It is
# used when submitting files into the job queue for processing.
#
# Currently, these settings may affect memory usage of all nodes, so
# it should be the same on all nodes.
files {
  # The chunk size (in bytes) used to store files. It affects the
  # memory footprint when uploading and downloading files: at most
  # this amount is loaded into RAM for down- and uploading.
  #
  # It also defines the chunk size used for the blobs inside the
  # database.
  chunk-size = 524288

  # The file content types that are considered valid. Docspell only
  # passes these files on to processing. The processing code itself
  # also checks which files are supported and which are not. This
  # setting affects the uploading part and can be used to restrict
  # the file types that are handed over to processing.
  # By default (empty list) all files are allowed.
  valid-mime-types = []

  # The id of an enabled store from `stores` below that should be
  # used.
  #
  # IMPORTANT NOTE: All nodes must have the exact same file store
  # configuration!
  default-store = "database"

  # The possible file stores. Each entry must have a unique id. The
  # `type` is one of: default-database, file-system, s3.
  #
  # The `enabled` property currently serves to define target stores
  # for the "copy files" task. All stores with enabled=false are
  # removed from the list. The `default-store` must be enabled.
  stores = {
    database = {
      enabled = true
      type = "default-database"
    }
    filesystem = {
      enabled = false
      type = "file-system"
      directory = "/some/directory"
    }
    minio = {
      enabled = false
      type = "s3"
      endpoint = "http://localhost:9000"
      access-key = "username"
      secret-key = "password"
      bucket = "docspell"
    }
  }
}
# Configuration of the full-text search engine. (the same must be used for restserver)
full-text-search {
  # Switch for the full-text search feature. It requires an
  # additional index server, which needs additional memory and disk
  # space. It can be enabled later at any time.
  #
  # Currently the SOLR search platform and PostgreSQL are supported.
  enabled = false

  # The backend to use: either solr or postgresql.
  backend = "solr"

  # Settings for the SOLR backend.
  solr = {
    # The URL to solr
    url = "http://localhost:8983/solr/docspell"

    # Tells solr when to commit the data
    commit-within = 1000

    # If true, request and response bodies are logged
    log-verbose = false

    # The defType parameter to lucene that selects the parser to
    # use. You might want to try "edismax" or look here:
    # https://solr.apache.org/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing
    def-type = "lucene"

    # The default combiner for tokens. One of {AND, OR}.
    q-op = "OR"
  }

  # Settings for the PostgreSQL backend.
  postgresql = {
    # Whether to use the default database; this only works if it
    # is postgresql.
    use-default-connection = false

    # The database connection.
    jdbc {
      url = "jdbc:postgresql://db:5432/dbname"
      user = "dbuser"
      password = "dbpass"
    }

    # A mapping from a language to a postgres text search config.
    # By default a language is mapped to a predefined config;
    # PostgreSQL has predefined configs for some languages. This
    # setting allows creating a custom text search config and
    # defining it here for some or all languages.
    #
    # Example:
    # { german = "my-german" }
    #
    # See https://www.postgresql.org/docs/14/textsearch-tables.html ff.
    pg-config = {
    }

    # Defines which query parser to use.
    #
    # https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-PARSING-QUERIES
    pg-query-parser = "websearch_to_tsquery"

    # Allows defining a normalization for the ranking.
    #
    # https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-RANKING
    pg-rank-normalization = [ 4 ]
  }

  # Settings for running the index migration tasks.
  migration = {
    # Chunk size to use when indexing data from the database. This
    # many attachments are loaded into memory and pushed to the
    # full-text index.
    index-all-chunk = 10
  }
}
addons {
  # Directory into which addons are extracted when running them.
  # Everything in here is cleared after each run.
  working-dir = ${java.io.tmpdir}"/docspell-addons"

  # Directory where addons can store data between runs. Docspell
  # does not clear it, so it can get large depending on the addons
  # executed.
  #
  # It is used as a base directory; subdirectories are created in
  # it per run configuration id.
  cache-dir = ${java.io.tmpdir}"/docspell-addon-cache"

  executor-config {
    # A (comma or whitespace separated) list of runners that are
    # responsible for executing an addon. This setting is compared
    # to what is supported by addons. Possible values are:
    #
    # - nix-flake: use nix-flake runner if the addon supports it
    #   (this requires the nix package manager on the joex machine)
    # - docker: use docker
    # - trivial: use the trivial runner
    #
    # The first successful execution is used. This should list all
    # runners the computer supports.
    runner = "nix-flake, docker, trivial"

    # systemd-nspawn can be used to run the program in a container.
    # This is used by the runners nix-flake and trivial.
    nspawn = {
      # If false, systemd-nspawn is not tried. If true, the addon
      # is executed inside a lightweight container via
      # systemd-nspawn.
      enabled = false

      # Path to the sudo command. By default systemd-nspawn is
      # executed via sudo - the user running joex must be allowed
      # to do so NON INTERACTIVELY. If this is empty, nspawn is
      # tried without sudo.
      sudo-binary = "sudo"

      # Path to the systemd-nspawn command.
      nspawn-binary = "systemd-nspawn"

      # Workaround for when multiple containers with the same name
      # are run too fast.
      container-wait = "100 millis"
    }

    # When multiple addons are executed sequentially, stop after
    # the first failing result. If this is false, subsequent addons
    # are run for their side effects only.
    fail-fast = true

    # The timeout for running an addon.
    run-timeout = "15 minutes"

    # Settings for the nix flake runner.
    nix-runner {
      # Path to the nix command.
      nix-binary = "nix"

      # The timeout for building the package (running nix build).
      build-timeout = "15 minutes"
    }

    # Settings for the docker runner.
    docker-runner {
      # Path to the docker command.
      docker-binary = "docker"

      # The timeout for building the package (running docker build).
      build-timeout = "15 minutes"
    }
  }
}
}