commit 6f6140e1a87a35a9621a59258b858263dc0353ab Author: Matan Horovitz Date: Sat Mar 18 16:31:43 2023 +0200 Inital commit - Docspell diff --git a/config/docspell.conf b/config/docspell.conf new file mode 100644 index 0000000..df521e0 --- /dev/null +++ b/config/docspell.conf @@ -0,0 +1,1421 @@ +docspell.server { + + # This is shown in the top right corner of the web application + app-name = "Docspell" + + # This is the id of this node. If you run more than one server, you + # have to make sure to provide unique ids per node. + app-id = "rest1" + + # This is the base URL this application is deployed to. This is used + # to create absolute URLs and to configure the cookie. + # + # If default is not changed, the HOST line of the login request is + # used instead or the value of the `X-Forwarded-For` header. If set + # to some other value, the request is not inspected. + base-url = "https://docs.pukeko.xyz" + + # This url is the base url for reaching this server internally. + # While you might set `base-url` to some external address (like + # mydocs.myserver.com), the `internal-url` must be set such that + # other nodes can reach this server. + internal-url = "http://restserver:7880" + + # Configures logging + logging { + # The format for the log messages. Can be one of: + # Json, Logfmt, Fancy or Plain + format = "Fancy" + + # The minimum level to log. From lowest to highest: + # Trace, Debug, Info, Warn, Error + minimum-level = "Warn" + + # Override the log level of specific loggers + levels = { + "docspell" = "Info" + "org.flywaydb" = "Info" + "binny" = "Info" + "org.http4s" = "Info" + } + } + + # Where the server binds to. 
+ bind { + address = "0.0.0.0" + port = 7880 + } + + # Options for tuning the http server + server-options { + enable-http-2 = false + + # Maximum allowed connections + max-connections = 1024 + + # Timeout for waiting for the first output of the response + response-timeout = 45s + } + + # This is a hard limit to restrict the size of a batch that is + # returned when searching for items. The user can set this limit + # within the client config, but it is restricted by the server to + # the number defined here. An admin might choose a lower number + # depending on the available resources. + max-item-page-size = 200 + + # The number of characters to return for each item notes when + # searching. Item notes may be very long, when returning them with + # all the results from a search, they add quite some data to return. + # In order to keep this low, a limit can be defined here. + max-note-length = 180 + + # This defines whether the classification form in the collective + # settings is displayed or not. If all joex instances have document + # classification disabled, it makes sense to hide its settings from + # users. + show-classification-settings = true + + # Authentication. + auth { + + # The secret for this server that is used to sign the authenicator + # tokens. If multiple servers are running, all must share the same + # secret. You can use base64 or hex strings (prefix with b64: and + # hex:, respectively). If empty, a random secret is generated. + # Example: b64:YRx77QujCGkHSvll0TVEmtTaw3Z5eXr+nWMsEJowgKg= + server-secret = "" + + # How long an authentication token is valid. The web application + # will get a new one periodically. + session-valid = "5 minutes" + + remember-me { + enabled = true + # How long the remember me cookie/token is valid. + valid = "30 days" + } + + # One of: fail, convert + # + # Accounts can be local or defined at a remote provider and + # integrated via OIDC. 
If the same account is defined in both + # sources, docspell by default fails if a user mixes logins (e.g. + # when registering a user locally and then logging in with the + # same user via OIDC). When set to `convert` docspell treats it as + # being the same and simply updates the account to reflect the new + # account source. + on-account-source-conflict = "fail" + } + + # Settings for "download as zip" + download-all { + # How many files to allow in a zip. + max-files = 500 + + # The maximum (uncompressed) size of the zip file contents. + max-size = 1400M + } + + # Configures OpenID Connect (OIDC) or OAuth2 authentication. Only + # the "Authorization Code Flow" is supported. + # + # Multiple authentication providers can be defined. Each is + # configured in the array below. The `provider` block gives all + # details necessary to authenticate against an external OIDC or + # OAuth provider. This requires at least two URLs for OIDC and three + # for OAuth2. When using OIDC, the `user-url` is only required if + # the account data is to be retrieved from the user-info endpoint + # and not from the JWT token. For the request to the `user-url`, the + # access token is then used to authenticate at the provider. Thus, + # it doesn't need to be validated here and therefore no `sign-key` + # setting is needed. However, if you want to extract the account + # information from the access token, it must be validated here and + # therefore the correct signature key and algorithm must be + # provided. If the `sign-key` is left empty, the `user-url` is used + # and must be specified. If the `sign-key` is _not_ empty, the + # response from the authentication provider is validated using this + # key. + # + # If a `logout-url` is provided, it will be used to finally redirect + # the browser to this url that should logout the user from Docspell + # at the provider. + # + # After successful authentication, docspell needs to create the + # account. 
For this a username and collective name is required. The + # account name is defined by the `user-key` and `collective-key` + # setting. The `user-key` is used to search the JSON structure, that + # is obtained from the JWT token or the user-info endpoint, for the + # login name to use. It traverses the JSON structure recursively, + # until it finds an object with that key. The first value is used. + # + # The `collective-key` can be used in multiple ways and both can + # work together to retrieve the full account id: + # + # - If it starts with `fixed:`, like "fixed:collective", the name + # after the `fixed:` prefix is used as collective as is. So all + # users are in the same collective. + # + # - If it starts with `lookup:`, like "lookup:collective_name", the + # value after the prefix is used to search the JSON response for + # an object with this key, just like it works with the `user-key`. + # + # - If it starts with `account:`, like "account:demo", it works the + # same as `lookup:` only that the value is interpreted as the full + # account name of form `collective/login`. The `user-key` value is + # ignored in this case. + # + # If these values cannot be obtained from the response, docspell + # fails the authentication. It is then assumed that the successfully + # authenticated user at the OP has not enough permissions to access + # docspell. + # + # Below are examples for OpenID Connect (keycloak) and OAuth2 + # (github). + openid = + [ { enabled = false, + + # The name to render on the login link/button. + display = "Keycloak" + + # This illustrates to use a custom keycloak setup as the + # authentication provider. For details, please refer to the + # keycloak documentation. The settings here assume a certain + # configuration at keycloak. + # + # Keycloak can be configured to return the collective name for + # each user in the access token. It may also be configured to + # return it in the user info response. 
If it is already in the + # access token, an additional request can be omitted. Set the + # `sign-key` to an empty string then. Otherwise provide the + # algo and key from your realm settings. In this example, the + # realm is called "home". + provider = { + provider-id = "keycloak", + client-id = "docspell", + client-secret = "example-secret-439e-bf06-911e4cdd56a6", + scope = "profile", # scope is required for OIDC + authorize-url = "http://localhost:8080/auth/realms/home/protocol/openid-connect/auth", + token-url = "http://localhost:8080/auth/realms/home/protocol/openid-connect/token", + #User URL is not used when signature key is set. + #user-url = "http://localhost:8080/auth/realms/home/protocol/openid-connect/userinfo", + logout-url = "http://localhost:8080/auth/realms/home/protocol/openid-connect/logout" + sign-key = "b64:anVzdC1hLXRlc3Q=", + sig-algo = "RS512" + }, + # The collective of the user is given in the access token as + # property `docspell_collective`. + collective-key = "lookup:docspell_collective", + # The username to use for the docspell account + user-key = "preferred_username" + }, + { enabled = true, + + # The name to render on the login link/button. + display = "Authelia" + provider = { + provider-id = "authelia", + client-id = "docspell", + client-secret = "tEf47Me$YsXG8K4%63$%!kbMqbgVnc*bAq2i4SPERay#T!&ajc35m&D%C#uRMiaSv@cRFxwMcqo%SwEq*49G9HufJ&d#^f*&MK9hzU6s&7C2^XmfGC8Up7YeegnH#VhP", + scope = "openid profile groups email", # scope is not needed for github + authorize-url = "https://auth.pukeko.xyz/api/oidc/authorize", + token-url = "https://auth.pukeko.xyz/api/oidc/token", + user-url = "https://auth.pukeko.xyz/api/oidc/userinfo", + sign-key = "" # this must be set empty + sig-algo = "RS256" #unused but must be set to something + }, + + # If the authentication provider doesn't provide the + # collective name, simply use a fixed one. This means all + # users from this provider are in the same collective! 
+ collective-key = "fixed:shmick", + + # Github provides the login name via the `login` property as + # response from the user-url. This value is used to construct + # the account in docspell. + user-key = "preferred_username" + } + ] + + # When exactly one OIDC/OAuth provider is configured, then the webapp + # automatically redirects to its authentication page skipping the + # docspell login page. + oidc-auto-redirect = true + + # This endpoint allows to upload files to any collective. The + # intention is that local software integrates with docspell more + # easily. Therefore the endpoint is not protected by the usual + # means. + # + # For security reasons, this endpoint is disabled by default. If + # enabled, you can choose from some ways to protect it. It may be a + # good idea to further protect this endpoint using a firewall, such + # that outside traffic is not routed. + # + # NOTE: If all protection methods are disabled, the endpoint is not + # protected at all! + integration-endpoint { + enabled = false + + # The priority to use when submitting files through this endpoint. + priority = "low" + + # The name used for the item "source" property when uploaded + # through this endpoint. + source-name = "integration" + + # IPv4 addresses to allow access. An empty list, if enabled, + # prohibits all requests. IP addresses may be specified as simple + # globs: a part marked as `*' matches any octet, like in + # `192.168.*.*`. The `127.0.0.1' (the default) matches the + # loopback address. + allowed-ips { + enabled = false + ips = [ "127.0.0.1" ] + } + + # Requests are expected to use http basic auth when uploading + # files. + http-basic { + enabled = false + realm = "Docspell Integration" + user = "docspell-int" + password = "docspell-int" + } + + # Requests are expected to supply some specific header when + # uploading files. 
+ http-header { + enabled = false + header-name = "Docspell-Integration" + header-value = "some-secret" + } + } + + # This is a special endpoint that allows some basic administration. + # + # It is intended to be used by admins only, that is users who + # installed the app and have access to the system. Normal users + # should not have access and therefore a secret must be provided in + # order to access it. + # + # This is used for some endpoints, for example: + # - re-create complete fulltext index: + # curl -XPOST -H'Docspell-Admin-Secret: xyz' http://docspell-restserver:7880/api/v1/admin/fts/reIndexAll + admin-endpoint { + # The secret. If empty, the endpoint is disabled. + secret = "" + } + + # Configuration of the full-text search engine. (the same must be used for joex) + full-text-search { + # The full-text search feature can be disabled. It requires an + # additional index server which needs additional memory and disk + # space. It can be enabled later any time. + # + # Currently the SOLR search platform and PostgreSQL is supported. + enabled = false + + # Which backend to use, either solr or postgresql + backend = "solr" + + # Configuration for the SOLR backend. + solr = { + # The URL to solr + url = "http://localhost:8983/solr/docspell" + # Used to tell solr when to commit the data + commit-within = 1000 + # If true, logs request and response bodies + log-verbose = false + # The defType parameter to lucene that defines the parser to + # use. You might want to try "edismax" or look here: + # https://solr.apache.org/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing + def-type = "lucene" + # The default combiner for tokens. One of {AND, OR}. + q-op = "OR" + } + + # Configuration for PostgreSQL backend + postgresql = { + # Whether to use the default database, only works if it is + # postgresql + use-default-connection = false + + # The database connection. 
+ jdbc { + url = "jdbc:postgresql://server:5432/db" + user = "pguser" + password = "" + } + + # A mapping from a language to a postgres text search config. By + # default a language is mapped to a predefined config. + # PostgreSQL has predefined configs for some languages. This + # setting allows to create a custom text search config and + # define it here for some or all languages. + # + # Example: + # { german = "my-german" } + # + # See https://www.postgresql.org/docs/14/textsearch-tables.html ff. + pg-config = { + } + + # Define which query parser to use. + # + # https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-PARSING-QUERIES + pg-query-parser = "websearch_to_tsquery" + + # Allows to define a normalization for the ranking. + # + # https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-RANKING + pg-rank-normalization = [ 4 ] + } + } + + # Configuration for the backend. + backend { + + # Enable or disable debugging for e-mail related functionality. This + # applies to both sending and receiving mails. For security reasons + # logging is not very extensive on authentication failures. Setting + # this to true, results in a lot of data printed to stdout. + mail-debug = false + + # The database connection. + jdbc { + # The JDBC url to the database. By default a H2 file-based + # database is configured. You can provide a postgresql or + # mariadb connection here. When using H2 use the PostgreSQL + # compatibility mode and AUTO_SERVER feature. + #url = "jdbc:h2://"${java.io.tmpdir}"/docspell-demo.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE" + url = "jdbc:postgresql://db:5432/dbname" + + # The database user. + user = "dbuser" + + # The database password. + password = "dbpass" + } + + # Additional settings related to schema migration. + database-schema = { + # Whether to run main database migrations. + run-main-migrations = true + + # Whether to run the fixup migrations. 
+ run-fixup-migrations = true + + # Use with care. This repairs all migrations in the database by + # updating their checksums and removing failed migrations. Good + # for testing, not recommended for normal operation. + repair-schema = false + } + + # Configuration for registering new users. + signup { + + # The mode defines if new users can signup or not. It can have + # three values: + # + # - open: every new user can sign up + # - invite: new users can sign up only if they provide a correct + # invitation key. Invitation keys can be generated by the + # server. + # - closed: signing up is disabled. + mode = "open" + + # If mode == 'invite', a password must be provided to generate + # invitation keys. It must not be empty. + new-invite-password = "" + + # If mode == 'invite', this is the period an invitation token is + # considered valid. + invite-time = "3 days" + } + + files { + # Defines the chunk size (in bytes) used to store the files. + # This will affect the memory footprint when uploading and + # downloading files. At most this amount is loaded into RAM for + # down- and uploading. + # + # It also defines the chunk size used for the blobs inside the + # database. + chunk-size = 2097152 + + # The file content types that are considered valid. Docspell + # will only pass these files to processing. The processing code + # itself has also checks for which files are supported and which + # not. This affects the uploading part and can be used to + # restrict file types that should be handed over to processing. + # By default all files are allowed. + valid-mime-types = [ ] + + # The id of an enabled store from the `stores` array that should + # be used. + # + # IMPORTANT NOTE: All nodes must have the exact same file store + # configuration! + default-store = "database" + + # A list of possible file stores. Each entry must have a unique + # id. The `type` is one of: default-database, filesystem, s3. 
+ # + # The enabled property serves currently to define target stores + # for the "copy files" task. All stores with enabled=false are + # removed from the list. The `default-store` must be enabled. + stores = { + database = + { enabled = true + type = "default-database" + } + + filesystem = + { enabled = false + type = "file-system" + directory = "/some/directory" + } + + minio = + { enabled = false + type = "s3" + endpoint = "http://localhost:9000" + access-key = "username" + secret-key = "password" + bucket = "docspell" + } + } + } + + addons = { + enabled = false + + # Whether installing addons requiring network should be allowed + # or not. + allow-impure = true + + # Define patterns of urls that are allowed to install addons + # from. + # + # A pattern is compared against an URL by comparing three parts + # of an URL via globs: scheme, host and path. + # + # You can use '*' (0 or more) and '?' (one) as wildcards in each + # part. For example: + # + # https://*.mydomain.com/projects/* + # *s://gitea.mydomain/* + # + # A hostname is separated by dots and the path by a slash. A '*' + # in a pattern means to match one or more characters. The path + # pattern is always matching the given prefix. So /a/b/* matches + # /a/b/c and /a/b/c/d and all other sub-paths. + # + # Multiple patterns can be defined via a comma separated string + # or as an array. An empty string matches no URL, while the + # special pattern '*' all by itself means to match every URL. + allowed-urls = "*" + + # Same as `allowed-urls` but a match here means to deny addons + # from this url. + denied-urls = "" + } + } +} +docspell.joex { + + # This is the id of this node. If you run more than one server, you + # have to make sure to provide unique ids per node. + app-id = "joex1" + + + # This is the base URL this application is deployed to. 
This is used + # to register this joex instance such that docspell rest servers can + # reach them + base-url = "http://docspell-joex:7878" + + # Where the REST server binds to. + # + # JOEX provides a very simple REST interface to inspect its state. + bind { + address = "0.0.0.0" + port = 7878 + } + + # Configures logging + logging { + # The format for the log messages. Can be one of: + # Json, Logfmt, Fancy or Plain + format = "Fancy" + + # The minimum level to log. From lowest to highest: + # Trace, Debug, Info, Warn, Error + minimum-level = "Warn" + + # Override the log level of specific loggers + levels = { + "docspell" = "Info" + "org.flywaydb" = "Info" + "binny" = "Info" + "org.http4s" = "Info" + } + } + + # The database connection. + # + # It must be the same connection as the rest server is using. + jdbc { + + # The JDBC url to the database. By default a H2 file-based + # database is configured. You can provide a postgresql or mariadb + # connection here. When using H2 use the PostgreSQL compatibility + # mode and AUTO_SERVER feature. + #url = "jdbc:h2://"${java.io.tmpdir}"/docspell-demo.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE" + url = "jdbc:postgresql://db:5432/dbname" + # The database user. + user = "dbuser" + # The database password. + password = "dbpass" + } + + # Additional settings related to schema migration. + database-schema = { + # Whether to run main database migrations. + run-main-migrations = true + + # Whether to run the fixup migrations. + run-fixup-migrations = true + + # Use with care. This repairs all migrations in the database by + # updating their checksums and removing failed migrations. Good + # for testing, not recommended for normal operation. + repair-schema = false + } + + # Enable or disable debugging for e-mail related functionality. This + # applies to both sending and receiving mails. For security reasons + # logging is not very extensive on authentication failures. 
Setting + # this to true, results in a lot of data printed to stdout. + mail-debug = false + + send-mail { + # This is used as the List-Id e-mail header when mails are sent + # from docspell to its users (example: for notification mails). It + # is not used when sending to external recipients. If it is empty, + # no such header is added. Using this header is often useful when + # filtering mails. + # + # It should be a string in angle brackets. See + # https://tools.ietf.org/html/rfc2919 for a formal specification + # of this header. + list-id = "" + } + + # Configuration for the job scheduler. + scheduler { + + # Each scheduler needs a unique name. This defaults to the node + # name, which must be unique, too. + name = ${docspell.joex.app-id} + + # Number of processing allowed in parallel. + pool-size = 1 + + # A counting scheme determines the ratio of how high- and low-prio + # jobs are run. For example: 4,1 means run 4 high prio jobs, then + # 1 low prio and then start over. + counting-scheme = "4,1" + + # How often a failed job should be retried until it enters failed + # state. If a job fails, it becomes "stuck" and will be retried + # after a delay. + retries = 2 + + # The delay until the next try is performed for a failed job. This + # delay is increased exponentially with the number of retries. + retry-delay = "1 minute" + + # The queue size of log statements from a job. + log-buffer-size = 500 + + # If no job is left in the queue, the scheduler will wait until a + # notify is requested (using the REST interface). To also retry + # stuck jobs, it will notify itself periodically. + wakeup-period = "30 minutes" + } + + periodic-scheduler { + + # Each scheduler needs a unique name. This defaults to the node + # name, which must be unique, too. + name = ${docspell.joex.app-id} + + # A fallback to start looking for due periodic tasks regularly. + # Usually joex instances should be notified via REST calls if + # external processes change tasks. 
But these requests may get + # lost. + wakeup-period = "10 minutes" + } + + # Configuration for the user-tasks. + user-tasks { + # Allows to import e-mails by scanning a mailbox. + scan-mailbox { + # A limit of how many folders to scan through. If a user + # configures more than this, only up to this limit folders are + # scanned and a warning is logged. + max-folders = 50 + + # How many mails (headers only) to retrieve in one chunk. + # + # If this is greater than `max-mails' it is set automatically to + # the value of `max-mails'. + mail-chunk-size = 50 + + # A limit on how many mails to process in one job run. This is + # meant to avoid too heavy resource allocation to one + # user/collective. + # + # If more than this number of mails is encountered, a warning is + # logged. + max-mails = 500 + } + } + + + # Docspell uses periodic house keeping tasks, like cleaning expired + # invites, that can be configured here. + house-keeping { + + # When the house keeping tasks execute. Default is to run every + # week. + schedule = "Sun *-*-* 00:00:00 UTC" + + # This task removes invitation keys that have been created but not + # used. The timespan here must be greater than the `invite-time' + # setting in the rest server config file. + cleanup-invites = { + + # Whether this task is enabled. + enabled = true + + # The minimum age of invites to be deleted. + older-than = "30 days" + } + + # This task removes expired remember-me tokens. The timespan + # should be greater than the `valid` time in the restserver + # config. + cleanup-remember-me = { + # Whether the job is enabled. + enabled = true + + # The minimum age of tokens to be deleted. + older-than = "30 days" + } + + # Jobs store their log output in the database. Normally this data + # is only interesting for some period of time. The processing logs + # of old files can be removed eventually. + cleanup-jobs = { + + # Whether this task is enabled. + enabled = true + + # The minimum age of jobs to delete. 
It is matched against the + # `finished' timestamp. + older-than = "30 days" + + # This defines how many jobs are deleted in one transaction. + # Since the data to delete may get large, it can be configured + # whether more or less memory should be used. + delete-batch = "100" + } + + # Zip files created for downloading multiple files are cached and + # can be cleared periodically. + cleanup-downloads = { + + # Whether to enable clearing old download archives. + enabled = true + + # The minimum age of a download file to be deleted. + older-than = "14 days" + } + + # Removes node entries that are not reachable anymore. + check-nodes { + # Whether this task is enabled + enabled = true + # How often the node must be unreachable, before it is removed. + min-not-found = 2 + } + + # Checks all files against their checksum + integrity-check { + enabled = true + } + } + + # A periodic task to check for new releases of docspell. It can + # inform about a new release via e-mail. You need to specify an + # account that has SMTP settings to use for sending. + update-check { + # Whether to enable this task + enabled = false + + # Sends the mail without checking the latest release. Can be used + # if you want to see if mail sending works, but don't want to wait + # until a new release is published. + test-run = false + + # When the update check should execute. Default is to run every + # week. You can specify a time zone identifier, like + # 'Europe/Berlin' at the end. + schedule = "Sun *-*-* 00:00:00 UTC" + + # An account id in form of `collective/user` (or just `user` if + # collective and user name are the same). This user account must + # have at least one valid SMTP settings which are used to send the + # mail. + sender-account = "" + + # The SMTP connection id that should be used for sending the mail. + smtp-id = "" + + # A list of recipient e-mail addresses. + # Example: `[ "john.doe@gmail.com" ]` + recipients = [] + + # The subject of the mail. 
It supports the same variables as the + # body. + subject = "Docspell {{ latestVersion }} is available" + + # The body of the mail. Subject and body can contain these + # variables which are replaced: + # + # - `latestVersion` the latest available version of Docspell + # - `currentVersion` the currently running (old) version of Docspell + # - `releasedAt` a date when the release was published + # + # The body is processed as markdown after the variables have been + # replaced. + body = """ +Hello, + +You are currently running Docspell {{ currentVersion }}. Version *{{ latestVersion }}* +is now available, which was released on {{ releasedAt }}. Check the release page at: + + + +Have a nice day! + +Docspell Update Check +""" + } + + # Configuration of text extraction + extraction { + # For PDF files it is first tried to read the text parts of the + # PDF. But PDFs can be complex documents and they may contain text + # and images. If the returned text is shorter than the value + # below, OCR is run afterwards. Then both extracted texts are + # compared and the longer will be used. + # + # If you set this to 0 (or a negative value), then the text parts + # of a PDF are ignored and OCR is always run and its result used. + pdf { + min-text-len = 500 + } + + preview { + # When rendering a pdf page, use this dpi. This results in + # scaling the image. A standard A4 page rendered at 96dpi + # results in roughly 790x1100px image. Using 32 results in + # roughly 200x300px image. + # + # Note, when this is changed, you might want to re-generate + # preview images. Check the api for this, there is an endpoint + # to regenerate all for a collective. + dpi = 32 + } + + # Extracting text using OCR works for image and pdf files. It will + # first run ghostscript to create a gray image from a pdf. Then + # unpaper is run to optimize the image for the upcoming ocr, which + # will be done by tesseract. 
All these programs must be available + # in your PATH or the absolute path can be specified below. + ocr { + + # Images greater than this size are skipped. Note that every + # image is loaded completely into memory for doing OCR. This is + # the pixel count, `height * width` of the image. + max-image-size = 28000000 + + # Defines what pages to process. If a PDF with 600 pages is + # submitted, it is probably not necessary to scan through all of + # them. This would take a long time and occupy resources for no + # value. The first few pages should suffice. The default is first + # 10 pages. + # + # If you want all pages being processed, set this number to -1. + # + # Note: if you change the ghostscript command below, be aware that + # this setting (if not -1) will add another parameter to the + # beginning of the command. + page-range { + begin = 10 + } + + # The ghostscript command. + ghostscript { + command { + program = "gs" + args = [ "-dNOPAUSE" + , "-dBATCH" + , "-dSAFER" + , "-sDEVICE=tiffscaled8" + , "-sOutputFile={{outfile}}" + , "{{infile}}" + ] + timeout = "5 minutes" + } + working-dir = ${java.io.tmpdir}"/docspell-extraction" + } + + # The unpaper command. + unpaper { + command { + program = "unpaper" + args = [ "{{infile}}", "{{outfile}}" ] + timeout = "5 minutes" + } + } + + # The tesseract command. + tesseract { + command { + program = "tesseract" + args = ["{{file}}" + , "stdout" + , "-l" + , "{{lang}}" + ] + timeout = "5 minutes" + } + } + } + } + + # Settings for text analysis + text-analysis { + # Maximum length of text to be analysed. + # + # All text to analyse must fit into RAM. A large document may take + # too much heap. Also, most important information is at the + # beginning of a document, so in most cases the first two pages + # should suffice. Default is 5000, which are about 2 pages (just a + # rough guess, of course). For my data, more than 80% of the + # documents are less than 5000 characters. 
+ # + # This value applies to nlp and the classifier. If this value is + # <= 0, the limit is disabled. + max-length = 0 + + # A working directory for the analyser to store temporary/working + # files. + working-dir = ${java.io.tmpdir}"/docspell-analysis" + + nlp { + # The mode for configuring NLP models: + # + # 1. full – builds the complete pipeline + # 2. basic - builds only the ner annotator + # 3. regexonly - matches each entry in your address book via regexps + # 4. disabled - doesn't use any stanford-nlp feature + # + # The full and basic variants rely on pre-built language models + # that are available for only a few languages. Memory usage + # varies among the languages. So joex should run with -Xmx1400M + # at least when using mode=full. + # + # The basic variant does a quite good job for German and + # English. It might be worse for French, always depending on the + # type of text that is analysed. Joex should run with about 500M + # heap, here again language German uses the most. + # + # The regexonly variant doesn't depend on a language. It roughly + # works by converting all entries in your addressbook into + # regexps and matches each one against the text. This can get + # memory intensive, too, when the addressbook grows large. This + # is included in the full and basic by default, but can be used + # independently by setting mode=regexner. + # + # When mode=disabled, then the whole nlp pipeline is disabled, + # and you won't get any suggestions. Only what the classifier + # returns (if enabled). + mode = full + + # The StanfordCoreNLP library caches language models which + # requires quite some amount of memory. Setting this interval to a + # positive duration, the cache is cleared after this amount of + # idle time. Set it to 0 to disable it if you have enough memory, + # processing will be faster. + # + # This has only any effect, if mode != disabled. + clear-interval = "15 minutes" + + # Restricts proposals for due dates. 
Only dates earlier than this
+ # number of years in the future are considered.
+ max-due-date-years = 10
+
+ regex-ner {
+ # Whether to enable custom NER annotation. This uses the
+ # address book of a collective as input for NER tagging (to
+ # automatically find correspondent and concerned entities). If
+ # the address book is large, this can be quite memory
+ # intensive and also makes text analysis much slower. But it
+ # improves accuracy and can be used independently of the
+ # language. If this is set to 0, it is effectively disabled
+ # and NER tagging uses only statistical models (that also work
+ # quite well, but are restricted to the languages mentioned
+ # above).
+ #
+ # Note, this is only relevant if nlp-config.mode is not
+ # "disabled".
+ max-entries = 1000
+
+ # The NER annotation uses a file of patterns that is derived
+ # from a collective's address book. This is the time how
+ # long this data will be kept until a check for a state change
+ # is done.
+ file-cache-time = "1 minute"
+ }
+ }
+
+ # Settings for doing document classification.
+ #
+ # This works by learning from existing documents. This requires a
+ # statistical model that is computed from all existing documents.
+ # This process is run periodically as configured by the
+ # collective. It may require more memory, depending on the amount
+ # of data.
+ #
+ # It utilises this NLP library: https://nlp.stanford.edu/.
+ classification {
+ # Whether to enable classification globally. Each collective can
+ # enable/disable auto-tagging. The classifier is also used for
+ # finding correspondents and concerned entities, if enabled
+ # here.
+ enabled = true
+
+ # If concerned with memory consumption, this restricts the
+ # number of items to consider. More are better for training. A
+ # negative value or zero means to train on all items.
+ #
+ # This limit and `text-analysis.max-length` define how much
+ # memory is required. On weaker hardware, it is advised to play
+ # with these values. 
+ item-count = 600 + + # These settings are used to configure the classifier. If + # multiple are given, they are all tried and the "best" is + # chosen at the end. See + # https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html + # for more info about these settings. The settings here yielded + # good results with *my* dataset. + # + # Enclose regexps in triple quotes. + classifiers = [ + { "useSplitWords" = "true" + "splitWordsTokenizerRegexp" = """[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|.""" + "splitWordsIgnoreRegexp" = """\s+""" + "useSplitPrefixSuffixNGrams" = "true" + "maxNGramLeng" = "4" + "minNGramLeng" = "1" + "splitWordShape" = "chris4" + "intern" = "true" # makes it slower but saves memory + } + ] + } + } + + # Configuration for converting files into PDFs. + # + # Most of it is delegated to external tools, which can be configured + # below. They must be in the PATH environment or specify the full + # path below via the `program` key. + convert { + + # The chunk size used when storing files. This should be the same + # as used with the rest server. + chunk-size = 2097152 +#${docspell.joex.files.chunk-size} + + # A string used to change the filename of the converted pdf file. + # If empty, the original file name is used for the pdf file ( the + # extension is always replaced with `pdf`). + converted-filename-part = "converted" + + # When reading images, this is the maximum size. Images that are + # larger are not processed. + max-image-size = ${docspell.joex.extraction.ocr.max-image-size} + + # Settings when processing markdown files (and other text files) + # to HTML. + # + # In order to support text formats, text files are first converted + # to HTML using a markdown processor. The resulting HTML is then + # converted to a PDF file. + markdown { + + # The CSS that is used to style the resulting HTML. 
+ internal-css = """ + body { padding: 2em 5em; } + """ + } + + # Which HTML->PDF converter command to use. One of: wkhtmlpdf, + # weasyprint. + html-converter = "wkhtmlpdf" + + # To convert HTML files into PDF files, the external tool + # wkhtmltopdf is used. + wkhtmlpdf { + command = { + program = "wkhtmltopdf" + args = [ + "-s", + "A4", + "--encoding", + "{{encoding}}", + "--load-error-handling", "ignore", + "--load-media-error-handling", "ignore", + "-", + "{{outfile}}" + ] + timeout = "10 minutes" + } + working-dir = ${java.io.tmpdir}"/docspell-wkhtmltopdf" + } + + # An alternative to wkhtmltopdf is weasyprint. + weasyprint { + command = { + program = "weasyprint" + args = [ + "--optimize-size", "all", + "--encoding", "{{encoding}}", + "-", + "{{outfile}}" + ] + timeout = "10 minutes" + } + working-dir = ${java.io.tmpdir}"/docspell-weasyprint" + } + + # To convert image files to PDF files, tesseract is used. This + # also extracts the text in one go. + tesseract = { + command = { + program = "tesseract" + args = [ + "{{infile}}", + "out", + "-l", + "{{lang}}", + "pdf", + "txt" + ] + timeout = "10 minutes" + } + working-dir = ${java.io.tmpdir}"/docspell-convert" + } + + # To convert "office" files to PDF files, the external tool + # unoconv is used. Unoconv uses libreoffice/openoffice for + # converting. So it supports all formats that are possible to read + # with libreoffice/openoffic. + # + # Note: to greatly improve performance, it is recommended to start + # a libreoffice listener by running `unoconv -l` in a separate + # process. + unoconv = { + command = { + program = "unoconv" + args = [ + "-f", + "pdf", + "-o", + "{{outfile}}", + "{{infile}}" + ] + timeout = "10 minutes" + } + working-dir = ${java.io.tmpdir}"/docspell-convert" + } + + # The tool ocrmypdf can be used to convert pdf files to pdf files + # in order to add extracted text as a separate layer. This makes + # image-only pdfs searchable and you can select and copy/paste the + # text. 
It also converts pdfs into pdf/a type pdfs, which are best
+ # suited for archiving. So it makes sense to use this even for
+ # text-only pdfs.
+ #
+ # It is recommended to install ocrmypdf, but it also is optional.
+ # If it is enabled but fails, the error is not fatal and the
+ # processing will continue using the original pdf for extracting
+ # text. You can also disable it to remove the errors from the
+ # processing logs.
+ #
+ # The `--skip-text` option is necessary to not fail on "text" pdfs
+ # (where ocr is not necessary). In this case, the pdf will be
+ # converted to PDF/A.
+ ocrmypdf = {
+ enabled = true
+ command = {
+ program = "ocrmypdf"
+ args = [
+ "-l", "{{lang}}",
+ "--skip-text",
+ "--deskew",
+ "-j", "1",
+ "{{infile}}",
+ "{{outfile}}"
+ ]
+ timeout = "10 minutes"
+ }
+ working-dir = ${java.io.tmpdir}"/docspell-convert"
+ }
+
+ # Allows to try to decrypt a PDF with encryption or protection. If
+ # enabled, a PDF's encryption or protection will be removed during
+ # conversion.
+ #
+ # For encrypted PDFs, this is necessary to be processed, because
+ # docspell needs to read it. It also requires to specify a
+ # password here. All passwords are tried when reading a PDF.
+ #
+ # This is enabled by default with an empty password list. This
+ # removes protection from PDFs, which is better for processing.
+ #
+ # Passwords can be given here and each collective can maintain
+ # their passwords as well. But if the `enabled` setting below is
+ # `false`, then no attempt at decrypting is done.
+ decrypt-pdf = {
+ enabled = true
+ passwords = []
+ }
+ }
+
+ # The same section is also present in the rest-server config. It is
+ # used when submitting files into the job queue for processing.
+ #
+ # Currently, these settings may affect memory usage of all nodes, so
+ # it should be the same on all nodes.
+ files {
+ # Defines the chunk size (in bytes) used to store the files. 
+ # This will affect the memory footprint when uploading and
+ # downloading files. At most this amount is loaded into RAM for
+ # down- and uploading.
+ #
+ # It also defines the chunk size used for the blobs inside the
+ # database.
+ chunk-size = 524288
+
+ # The file content types that are considered valid. Docspell
+ # will only pass these files to processing. The processing code
+ # itself also has checks for which files are supported and which
+ # not. This affects the uploading part and can be used to
+ # restrict file types that should be handed over to processing.
+ # By default all files are allowed.
+ valid-mime-types = [ ]
+
+ # The id of an enabled store from the `stores` array that should
+ # be used.
+ #
+ # IMPORTANT NOTE: All nodes must have the exact same file store
+ # configuration!
+ default-store = "database"
+
+ # A list of possible file stores. Each entry must have a unique
+ # id. The `type` is one of: default-database, filesystem, s3.
+ #
+ # The enabled property currently serves to define target stores
+ # for the "copy files" task. All stores with enabled=false are
+ # removed from the list. The `default-store` must be enabled.
+ stores = {
+ database =
+ { enabled = true
+ type = "default-database"
+ }
+
+ filesystem =
+ { enabled = false
+ type = "file-system"
+ directory = "/some/directory"
+ }
+
+ minio =
+ { enabled = false
+ type = "s3"
+ endpoint = "http://localhost:9000"
+ access-key = "username"
+ secret-key = "password"
+ bucket = "docspell"
+ }
+ }
+ }
+
+ # Configuration of the full-text search engine. (the same must be used for restserver)
+ full-text-search {
+ # The full-text search feature can be disabled. It requires an
+ # additional index server which needs additional memory and disk
+ # space. It can be enabled later any time.
+ #
+ # Currently the SOLR search platform and PostgreSQL are supported. 
+ enabled = false + + # Which backend to use, either solr or postgresql + backend = "solr" + + # Configuration for the SOLR backend. + solr = { + # The URL to solr + url = "http://localhost:8983/solr/docspell" + # Used to tell solr when to commit the data + commit-within = 1000 + # If true, logs request and response bodies + log-verbose = false + # The defType parameter to lucene that defines the parser to + # use. You might want to try "edismax" or look here: + # https://solr.apache.org/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing + def-type = "lucene" + # The default combiner for tokens. One of {AND, OR}. + q-op = "OR" + } + + # Configuration for PostgreSQL backend + postgresql = { + # Whether to use the default database, only works if it is + # postgresql + use-default-connection = false + + # The database connection. + jdbc { + url = "jdbc:postgresql://db:5432/dbname" + user = "dbuser" + password = "dbpass" + } + + # A mapping from a language to a postgres text search config. By + # default a language is mapped to a predefined config. + # PostgreSQL has predefined configs for some languages. This + # setting allows to create a custom text search config and + # define it here for some or all languages. + # + # Example: + # { german = "my-german" } + # + # See https://www.postgresql.org/docs/14/textsearch-tables.html ff. + pg-config = { + } + + # Define which query parser to use. + # + # https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-PARSING-QUERIES + pg-query-parser = "websearch_to_tsquery" + + # Allows to define a normalization for the ranking. + # + # https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-RANKING + pg-rank-normalization = [ 4 ] + } + + # Settings for running the index migration tasks + migration = { + # Chunk size to use when indexing data from the database. This + # many attachments are loaded into memory and pushed to the + # full-text index. 
+ index-all-chunk = 10 + } + } + + addons { + # A directory to extract addons when running them. Everything in + # here will be cleared after each run. + working-dir = ${java.io.tmpdir}"/docspell-addons" + + # A directory for addons to store data between runs. This is not + # cleared by Docspell and can get large depending on the addons + # executed. + # + # This directory is used as base. In it subdirectories are created + # per run configuration id. + cache-dir = ${java.io.tmpdir}"/docspell-addon-cache" + + executor-config { + # Define a (comma or whitespace separated) list of runners that + # are responsible for executing an addon. This setting is + # compared to what is supported by addons. Possible values are: + # + # - nix-flake: use nix-flake runner if the addon supports it + # (this requires the nix package manager on the joex machine) + # - docker: use docker + # - trivial: use the trivial runner + # + # The first successful execution is used. This should list all + # runners the computer supports. + runner = "nix-flake, docker, trivial" + + # systemd-nspawn can be used to run the program in a container. + # This is used by runners nix-flake and trivial. + nspawn = { + # If this is false, systemd-nspawn is not tried. When true, the + # addon is executed inside a lightweight container via + # systemd-nspawn. + enabled = false + + # Path to sudo command. By default systemd-nspawn is executed + # via sudo - the user running joex must be allowed to do so NON + # INTERACTIVELY. If this is empty, then nspawn is tried to + # execute without sudo. + sudo-binary = "sudo" + + # Path to the systemd-nspawn command. + nspawn-binary = "systemd-nspawn" + + # Workaround, if multiple same named containers are run too fast + container-wait = "100 millis" + } + + # When multiple addons are executed sequentially, stop after the + # first failing result. If this is false, then subsequent addons + # will be run for their side effects only. 
+ fail-fast = true + + # The timeout for running an addon. + run-timeout = "15 minutes" + + # Configure the nix flake runner. + nix-runner { + # Path to the nix command. + nix-binary = "nix" + + # The timeout for building the package (running nix build). + build-timeout = "15 minutes" + } + + # Configure the docker runner + docker-runner { + # Path to the docker command. + docker-binary = "docker" + + # The timeout for building the package (running docker build). + build-timeout = "15 minutes" + } + } + } +} + diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..bd19438 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,147 @@ +version: '3.8' +services: + restserver: + image: docspell/restserver:latest + container_name: docspell-restserver + restart: unless-stopped + ports: + - "7880:7880" + environment: + - TZ=Asia/Jerusalem + - DOCSPELL_SERVER_INTERNAL__URL=http://docspell-restserver:7880 + - DOCSPELL_SERVER_ADMIN__ENDPOINT_SECRET=admin123 + - DOCSPELL_SERVER_AUTH_SERVER__SECRET= + - DOCSPELL_SERVER_BACKEND_JDBC_PASSWORD=dbpass + - DOCSPELL_SERVER_BACKEND_JDBC_URL=jdbc:postgresql://db:5432/dbname + - DOCSPELL_SERVER_BACKEND_JDBC_USER=dbuser + - DOCSPELL_SERVER_BIND_ADDRESS=0.0.0.0 + - DOCSPELL_SERVER_FULL__TEXT__SEARCH_ENABLED=true + - DOCSPELL_SERVER_FULL__TEXT__SEARCH_SOLR_URL=http://docspell-solr:8983/solr/docspell + - DOCSPELL_SERVER_INTEGRATION__ENDPOINT_ENABLED=true + - DOCSPELL_SERVER_INTEGRATION__ENDPOINT_HTTP__HEADER_ENABLED=true + - DOCSPELL_SERVER_INTEGRATION__ENDPOINT_HTTP__HEADER_HEADER__VALUE=integration-password123 + - DOCSPELL_SERVER_BACKEND_SIGNUP_MODE=open + - DOCSPELL_SERVER_BACKEND_SIGNUP_NEW__INVITE__PASSWORD= + - DOCSPELL_SERVER_BACKEND_ADDONS_ENABLED=false + depends_on: + - solr + - db + networks: + - network + - internal + labels: + - "traefik.enable=true" + - "traefik.docker.network=docspell_network" + - "traefik.http.routers.docspell.entrypoints=websecure" + - 
"traefik.http.services.docspell.loadbalancer.server.port=7880" + - "traefik.http.routers.docspell.rule=Host(`docs.pukeko.xyz`)" + - "traefik.http.routers.docspell.tls.certresolver=pukekoresolver" + # - "traefik.http.routers.docspell.middlewares=authelia@docker" + volumes: + - ./config/docspell.conf:/opt/docspell.conf + command: + - /opt/docspell.conf + + + joex: + image: docspell/joex:latest + container_name: docspell-joex + ## For more memory add corresponding arguments, like below. Also see + ## https://docspell.org/docs/configure/#jvm-options + # command: + # - -J-Xmx3G + restart: unless-stopped + environment: + - TZ=Asia/Jerusalem + - DOCSPELL_JOEX_APP__ID=joex1 + - DOCSPELL_JOEX_PERIODIC__SCHEDULER_NAME=joex1 + - DOCSPELL_JOEX_SCHEDULER_NAME=joex1 + - DOCSPELL_JOEX_BASE__URL=http://docspell-joex:7878 + - DOCSPELL_JOEX_BIND_ADDRESS=0.0.0.0 + - DOCSPELL_JOEX_FULL__TEXT__SEARCH_ENABLED=true + - DOCSPELL_JOEX_FULL__TEXT__SEARCH_SOLR_URL=http://docspell-solr:8983/solr/docspell + - DOCSPELL_JOEX_JDBC_PASSWORD=dbpass + - DOCSPELL_JOEX_JDBC_URL=jdbc:postgresql://db:5432/dbname + - DOCSPELL_JOEX_JDBC_USER=dbuser + - DOCSPELL_JOEX_ADDONS_EXECUTOR__CONFIG_RUNNER=docker,trivial + - DOCSPELL_JOEX_CONVERT_HTML__CONVERTER=weasyprint + ports: + - "8880:7878" + depends_on: + - solr + - db + networks: + - internal + volumes: + - ./config/docspell.conf:/opt/docspell.conf + ## Uncomment when using the "docker" runner with addons + # volumes: + # - /var/run/docker.sock:/var/run/docker.sock + # - /tmp:/tmp + + # The consumedir container watches a directory for files to upload + # to docspell restserver. This uses the `dsc` tool. For information + # on the available options, see `dsc --help`. + # https://github.com/docspell/dsc + # + # The value after `Docspell-Integration` must match the secret + # specified at the restserver via + # DOCSPELL_SERVER_INTEGRATION__ENDPOINT_HTTP__HEADER_HEADER__VALUE. + # + # The --not-matches "**/.*" option is to exclude hidden files. 
+ consumedir: + image: docspell/dsc:latest + container_name: docspell-consumedir + command: + - dsc + - "-d" + - "http://docspell-restserver:7880" + - "watch" + - "--delete" + - "-ir" + - "--not-matches" + - "**/.*" + - "--header" + - "Docspell-Integration:integration-password123" + - "/opt/docs" + restart: unless-stopped + volumes: + - ./docs:/opt/docs + depends_on: + - restserver + + db: + image: postgres:15.2 + container_name: postgres_db + restart: unless-stopped + volumes: + - ./docspell-postgres_data:/var/lib/postgresql/data/ + environment: + - POSTGRES_USER=dbuser + - POSTGRES_PASSWORD=dbpass + - POSTGRES_DB=dbname + networks: + - internal + + solr: + image: solr:9 + container_name: docspell-solr + restart: unless-stopped + volumes: + - ./docspell-solr_data:/var/solr + command: + - solr-precreate + - docspell + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8983/solr/docspell/admin/ping"] + interval: 1m + timeout: 10s + retries: 2 + start_period: 30s + networks: + - internal +networks: + network: + driver: bridge + internal: + driver: bridge