@prefix this: . @prefix sub: . @prefix np: . @prefix dct: . @prefix pav: . @prefix xsd: . @prefix rdfs: . @prefix prov: . @prefix npx: . sub:Head { this: np:hasAssertion sub:assertion; np:hasProvenance sub:provenance; np:hasPublicationInfo sub:pubinfo; a np:Nanopublication . } sub:assertion { "WorkflowHub"; "https://about.workflowhub.eu/"; a . "Workflow RO-Crate Profile"; "0.2.0"; a . "Yasmmin Martins"; a . "MaƂgorzata Wolniewicz"; a . "https://doi.org/10.1093/bioinformatics/bts480"; "Snakemake"; "https://snakemake.readthedocs.io/"; a . "organism folders directory"; a ; "1.9624945486262537"; "4.5" . "training ppi augmentation"; a ; "7.283035324901875"; "16.7" . "environment"; a ; "2.3600185099490973"; "5.1" . "Then it proceeds parsing and cleaning the HPIDB results and downloading the protein interactions of the found organisms from the STRING database."; a ; "11.815561959654177"; "12.3" . "Scientific workflow to augment, predict and evaluate host-pathogen protein-protein interactions"; a ; "18.443804034582133"; "19.2" . "http"; a ; "3.0078667283664973"; "6.5" . "host"; a ; "3.6437246963562755"; "13.5" . "step"; a ; "12.216566404442387"; "26.4" . "law"; a ; "1.7699115044247788"; "1.4" . "data"; a ; "6.4507422402159245"; "23.9" . "rete"; a ; "4.8582995951417"; "18.0" . """## Resources - [How to Contribute to Open Source](https://opensource.guide/how-to-contribute/) - [Using Pull Requests](https://help.github.com/articles/about-pull-requests/) - [GitHub Help](https://help.github.com)"""; a ; "12.199807877041305"; "12.7" . "protein"; a ; "11.470985155195681"; "42.5" . """## Contributing [fork]: /fork [pr]: /compare [style]: https://standardjs.com/ [code-of-conduct]: CODE_OF_CONDUCT.md"""; a ; "9.414024975984631"; "9.8" . "preparation instruction"; a ; "1.439162668992586"; "3.3" . "data acquisition step"; a ; "1.9188835586567816"; "4.4" . "pull"; a ; "2.822767237390097"; "6.1" . "interaction"; a ; "8.367071524966262"; "31.0" . "Genetics"; a ; "Science and technology/Natural science/Biology/Genetics" . "pathogen"; a ; "4.993252361673414"; "18.5" . "environment"; a ; "1.9163292847503375"; "7.1" . "Genetics"; a ; "Science and technology/Natural science/Biology/Genetics" . "training"; a ; "4.257288292457195"; "9.2" . "work in progress pull request"; a ; "18.796336676842564"; "43.1" . "visualization step"; a ; "2.0497165285651984"; "4.7" . "IT-computer sciences"; a ; "Science and technology/Technology and engineering/IT-computer sciences" . "query protein ID"; a ; "3.6633231574356735"; "8.4" . "using pull request"; a ; "4.361098996947231"; "10.0" . "pull request 1"; a ; "4.53554295682512"; "10.4" . "oceanography"; a ; "100.0"; "1.8297315537929535" . "software"; a ; "22.629582806573957"; "17.9" . "string protein identifier"; a ; "3.445268207588312"; "7.8999999999999995" . "life sciences"; a ; "83.09801429584705"; "1.9117830991744995" . "Work in Progress pull requests are also welcome to get feedback early on, or if there is something blocked you."; a ; "8.549471661863592"; "8.9" . "host"; a ; "6.247107820453494"; "13.5" . "npm test"; a ; "1.70082860880942"; "3.9" . "data"; a ; "7.265155020823693"; "15.7" . "folder"; a ; "2.4831309041835357"; "9.2" . "IT-computer sciences"; a ; "Science and technology/Technology and engineering/IT-computer sciences" . "computer science"; a ; "40.07585335018964"; "31.700000000000003" . "computer programming"; a ; "17.825537294563844"; "14.1" . "step"; a ; "6.828609986504723"; "25.3" . "target"; a ; "2.2132253711201075"; "8.2" . "database"; a ; "11.630847029077117"; "9.2" . "identifier"; a ; "5.414160111059695"; "11.7" . "interaction data"; a ; "0.9158307893589184"; "2.1" . "http"; a ; "2.294197031039136"; "8.5" . "2023"; a . "Public relations"; a ; "Economy, business and finance/Economic sector/Media/Public relations" . "host-pathogen protein-protein interaction"; a ; "8.242477104230264"; "18.9" . "system"; a ; "2.42914979757085"; "9.0" . "earth sciences"; a ; "100.0"; "1.8297315537929535" . "computer programming and software"; a ; "16.901985704152953"; "0.3888532221317291" . "method"; a ; "1.8509949097639982"; "4.0" . "Health"; a ; "Health" . "information technology"; a ; "4.677623261694058"; "3.7" . "acquisition step"; a ; "13.170518970780636"; "30.2" . "Diseases and conditions"; a ; "Health/Diseases and conditions" . "update test"; a ; "3.270824247710423"; "7.5" . "home"; a ; "4.211013419713096"; "9.1" . "organism"; a ; "4.561403508771929"; "16.9" . "Food"; a ; "Economy, business and finance/Economic sector/Consumer goods/Food" . "pathogen"; a ; "5.460434983803795"; "11.8" . "evaluation visualization step"; a ; "10.248582642825992"; "23.5" . "pull"; a ; "2.2672064777327936"; "8.4" . "Research Object Crate for HPPIDiscovery - Scientific workflow to augment, predict and evaluate host-pathogen protein-protein interactions."; a ; "19.788664745437078"; "20.6" . "instruction"; a ; "1.897269782508098"; "4.1" . "identifier"; a ; "4.804318488529015"; "17.8" . "life sciences (general)"; a ; "83.09801429584705"; "1.9117830991744995" . "host proteins information"; a ; "2.878325337985172"; "6.6" . "request"; a ; "6.802406293382694"; "14.7" . "This option is extremely useful when you are not sure that your organism has validated protein interactions, then it finds validated interactions from the closest proteins in the database."; a ; "19.788664745437078"; "20.6" . "pathogen network"; a ; "8.068033144352377"; "18.5" . "interaction"; a ; "6.756131420638593"; "14.6" . "addestramento"; a ; "2.4021592442645074"; "8.9" . pav:importedBy ; ; "73915655"^^xsd:integer; "https://api.rohub.org/api/ros/dd5c3d62-b632-46a1-99e4-761f2e6cb60d/crate/download/"; "Stable"; ; "2023-10-20 08:40:49.625147+00:00"; "2024-03-05 12:23:14.770896+00:00"; "2023-10-20 08:40:49.625147+00:00"; """## Summary HPPIDiscovery is a scientific workflow to augment, predict and perform an insilico curation of host-pathogen Protein-Protein Interactions (PPIs) using graph theory to build new candidate ppis and machine learning to predict and evaluate them by combining multiple PPI detection methods of proteins according to three categories: structural, based on primary aminoacid sequence and functional annotations.
HPPIDiscovery contains three main steps: (i) acquirement of pathogen and host proteins information from seed ppis provided by HPIDB search methods, (ii) Model training and generation of new candidate ppis from HPIDB seed proteins' partners, and (iii) Evaluation of new candidate ppis and results exportation. (i) The first step acquires the identification of the taxonomy ids of the host and pathogen organisms in the result files. Then it proceeds parsing and cleaning the HPIDB results and downloading the protein interactions of the found organisms from the STRING database. The string protein identifiers are also mapped using the id mapping tool of uniprot API and we retrieve the uniprot entry ids along with the functional annotations, sequence, domain and kegg enzymes. (ii) The second step builds the training dataset using the non redundant hpidb validated interactions of each genome as positive set and random string low confidence ppis from each genome as negative set. Then, PredPrin tool is executed in the training mode to obtain the model that will evaluate the new candidate PPIs. The new ppis are then generated by performing a pairwise combination of string partners of host and pathogen hpidb proteins. Finally, (iii) in the third step, the predprin tool is used in the test mode to evaluate the new ppis and generate the reports and list of positively predicted ppis. The figure below illustrates the steps of this workflow. ## Requirements: * Edit the configuration file (config.yaml) according to your own data, filling out the following fields: - base_data: location of the organism folders directory, example: /home/user/data/genomes - parameters_file: Since this workflow may perform parallel processing of multiple organisms at the same time, you must prepate a tabulated file containng the genome folder names located in base data, where the hpidb files are located. Example: /home/user/data/params.tsv. It must have the following columns: genome (folder name), hpidb_seed_network (the result exported by one of the search methods available in hpidb database), hpidb_search_method (the type of search used to generate the results) and target_taxon (the target taxon id). The column hpidb_source may have two values: keyword or homology. In the keyword mode, you provide a taxonomy, protein name, publication id or detection method and you save all results (mitab.zip) in the genome folder. Finally, in the homology mode allows the user to search for host pathogen ppis giving as input fasta sequences of a set of proteins of the target pathgen for enrichment (so you have to select the search for a pathogen set) and you save the zip folder results (interaction data) in the genome folder. This option is extremely useful when you are not sure that your organism has validated protein interactions, then it finds validated interactions from the closest proteins in the database. In case of using the homology mode, the identifiers of the pathogens' query fasta sequences must be a Uniprot ID. All the query protein IDs must belong to the same target organism (taxon id). - model_file: path of a previously trained model in joblib format (if you want to train from the known validated PPIs given as seeds, just put a 'None' value) ## Usage Instructions The steps below consider the creation of a sqlite database file with all he tasks events which can be used after to retrieve the execution time taken by the tasks. It is possible run locally too (see luigi's documentation to change the running command).

* Preparation: 1. ````git clone https://github.com/YasCoMa/hppidiscovery.git```` 2. ````cd hppidiscovery```` 3. ````mkdir luigi_log```` 4. ````luigid --background --logdir luigi_log```` (start luigi server) 5. conda env create -f hp_ppi_augmentation.yml 6. conda activate hp_ppi_augmentation 6.1. (execute ````pip3 install wget```` (it is not installed in the environment)) 7. run ````pwd```` command and get the full path 8. Substitute in config_example.yaml with the full path obtained in the previous step 9. Download SPRINT pre-computed similarities in https://www.csd.uwo.ca/~ilie/SPRINT/precomputed_similarities.zip and unzip it inside workflow_hpAugmentation/predprin/core/sprint/HSP/ 10. ````cd workflow_hpAugmentation/predprin/```` 11. Uncompress annotation_data.zip 12. Uncompress sequence_data.zip 13. ````cd ../../```` 14. ````cd workflow_hpAugmentation```` 15. snake -n (check the plan of jobs, it should return no errors and exceptions) 16. snakemake -j 4 (change this number according the number of genomes to analyse and the amount of cores available in your machine)""", ; "application/ld+json"; , , , , , , , , , ; "https://w3id.org/ro-id/dd5c3d62-b632-46a1-99e4-761f2e6cb60d"; "https://github.com/YasCoMa/hppidiscovery"; ; "workflow_hpAugmentation/Snakefile"; "Research Object Crate for HPPIDiscovery - Scientific workflow to augment, predict and evaluate host-pathogen protein-protein interactions"; "https://workflowhub.eu/workflows/611/ro_crate?version=1"; a , , ; "https://w3id.org/ro-id/111af928-f686-4bda-924b-4d6407e6feed", "https://w3id.org/ro-id/583f870d-0ee2-48d1-9e61-e09c5ffe6a32", "https://w3id.org/ro-id/6c5513c3-a907-4281-b555-760e558bc960", "https://w3id.org/ro-id/6cc6fd78-3a84-4d03-be24-b5f8e451b21a", "https://w3id.org/ro-id/77ae93dc-b89d-4abf-b735-37a67bef9da4", "https://w3id.org/ro-id/9ae27d40-6996-4ad9-94a3-515538c22c79", "https://w3id.org/ro-id/f2e728b8-5c9c-4555-9232-0131092b6272"; "https://w3id.org/ro-id/0f4c93e8-91a3-4fff-962c-8bb79138b6fd", "https://w3id.org/ro-id/11614cdb-1e09-45c2-a980-f2949861e022", "https://w3id.org/ro-id/14ffbeea-0136-4cd9-aab2-b972b0af8aa7", "https://w3id.org/ro-id/1b4a9ef4-0ce2-4517-88ab-36c39b794934", "https://w3id.org/ro-id/29fe8e97-4b37-4b75-a967-c2cdc0a0bbd7", "https://w3id.org/ro-id/33d9f4f0-0e3f-4797-89a2-2267f00f7297", "https://w3id.org/ro-id/37380120-5d64-4364-ab64-5a8a366db1a8", "https://w3id.org/ro-id/6a8ba727-fe18-4cf9-901b-cd475d144ab5", "https://w3id.org/ro-id/6f2c1981-b3c6-4b9e-b1fa-7ec9d3f190fe", "https://w3id.org/ro-id/761ad95e-74d8-42b1-a361-a7a10b56e77a", "https://w3id.org/ro-id/80ef9426-65fd-472a-aa77-d0cad8fde53a", "https://w3id.org/ro-id/8c13e0ba-4ad1-47dd-9c35-6a7698c704fb", "https://w3id.org/ro-id/a0ba2136-f117-412d-affb-a594d0f0367a", "https://w3id.org/ro-id/a76aba8c-99c2-402d-984c-a82439a65f6d", "https://w3id.org/ro-id/bf2c5ac9-fc47-445e-b1ed-b9635968a414", "https://w3id.org/ro-id/dc2458ae-e759-4a66-8b66-ec7cba0ae721", "https://w3id.org/ro-id/e0d9119f-b919-46c8-bdb1-bffc34db68e5", "https://w3id.org/ro-id/ee1792d5-39ea-478e-ae8f-be4e70fbee5e", "https://w3id.org/ro-id/efe7db16-7b5f-438a-b506-6cab1eebd39e"; "https://w3id.org/ro-id/56e8cc9e-8d06-4f92-9f37-67de3065d53b", "https://w3id.org/ro-id/8efd5124-a3c9-42d7-a7d4-d297958efc38"; "https://w3id.org/ro-id/2b843c5e-081f-49dc-9da5-0be7fb9e068d", "https://w3id.org/ro-id/383c0311-2cbb-4eef-97bf-a2a84a394ec8", "https://w3id.org/ro-id/489cf36b-1781-405c-a95c-1860eb4f28c3", "https://w3id.org/ro-id/6bea3bdc-97ec-430a-a6f4-d1c71b6166bd", "https://w3id.org/ro-id/846568c0-81be-46a9-9556-b3b5e2292abd", "https://w3id.org/ro-id/9355543e-8e6b-4810-9ddb-86142bb61e1a", "https://w3id.org/ro-id/9c611509-48ea-4cba-b0ca-594c08da12b8", "https://w3id.org/ro-id/a3068f62-21f1-42b2-a8ea-89b52afb5b4d", "https://w3id.org/ro-id/eb7e7a24-4da5-4831-8bd4-0a7275d926d8", "https://w3id.org/ro-id/ed30e4ff-cd62-4153-a39a-71b4921e14bb"; "https://w3id.org/ro-id/08b000c6-af1c-47b8-ba82-5954c04d297a", "https://w3id.org/ro-id/0dd7faa8-1a2d-40e0-abe2-96240507fa99", "https://w3id.org/ro-id/0f7be7e1-67c9-4f86-a4e3-377e9814ac87", "https://w3id.org/ro-id/28a2826c-83a2-4912-87d5-5b06809d55e3", "https://w3id.org/ro-id/3f5d31f1-5fbb-43c7-90ff-bf8741fbd679", "https://w3id.org/ro-id/5ee960b5-fc3f-4176-a479-05561cb2c567", "https://w3id.org/ro-id/6a79e54e-4aba-44fd-ae8e-961460eeae62", "https://w3id.org/ro-id/79471dea-aa76-4ff0-bc4a-742f94d355f3", "https://w3id.org/ro-id/924014a8-80ef-4a3d-b561-706c52094de3", "https://w3id.org/ro-id/9d3e75e3-947d-4512-b0a4-5374946bff18", "https://w3id.org/ro-id/a4aaa62c-5ddd-4cbc-970f-7171b805d093", "https://w3id.org/ro-id/b00941e9-c13e-490a-9c62-8e16686a0643", "https://w3id.org/ro-id/cceecac1-b5ac-4102-bd0a-b351a1f10570", "https://w3id.org/ro-id/d5124545-85a5-4d48-b2a4-c5a5e13fd7a5", "https://w3id.org/ro-id/e3495957-27d9-4397-a413-9817a50b9b46", "https://w3id.org/ro-id/eeb62c93-f773-4be0-b61c-b0f780bee084", "https://w3id.org/ro-id/f3327c85-d7a6-4af9-b92b-38a8edd6d3c4", "https://w3id.org/ro-id/f494add2-e4bb-40b1-9113-cc071ade8d77", "https://w3id.org/ro-id/f8050931-c4dc-48d0-85d7-489292466a09"; "https://w3id.org/ro-id/5a770683-edbb-466d-8126-3430e887a114", "https://w3id.org/ro-id/922c69e2-587b-4b7b-baad-74783984b07f", "https://w3id.org/ro-id/c0717ab2-4857-4304-bd64-cdc633d0534b", "https://w3id.org/ro-id/efc0610c-ffb1-4d06-8c61-e169dcd86d63"; "https://w3id.org/ro-id/0062773b-e965-4d7d-97d5-1ce1bc6c7808", "https://w3id.org/ro-id/05f8872e-477a-4d77-ac87-2fc8802af478", "https://w3id.org/ro-id/2404c310-981b-4ff7-b898-2404e3f6920f", "https://w3id.org/ro-id/25b7e0e8-a10a-440d-b3ec-f99ab8496cbf", "https://w3id.org/ro-id/46742160-8c62-48ef-a24d-50418dac0a11", "https://w3id.org/ro-id/47bcd546-feb2-43bb-891a-77dd851994eb", "https://w3id.org/ro-id/4f089e4d-ac64-47d0-b7b9-c114f675caf3", "https://w3id.org/ro-id/5271272f-075f-4a0e-ae9f-32e3d9ae9811", "https://w3id.org/ro-id/53fe5737-1ebd-4df6-ad43-0f2df330fc2a", "https://w3id.org/ro-id/593fb093-724b-46da-8db4-5e02795beb81", "https://w3id.org/ro-id/609b841e-2660-40ff-a156-609a2769d813", "https://w3id.org/ro-id/7f8cec9d-f890-4575-9d29-2ecab9dca5e8", "https://w3id.org/ro-id/8a640df1-03a0-49a2-9428-3f3618cdd827", "https://w3id.org/ro-id/9bc60d27-d928-4a67-b0b0-ae4d8619baa5", "https://w3id.org/ro-id/9d34e7a7-777c-4c91-9d69-e34ca957e5a1", "https://w3id.org/ro-id/a6908148-8ac2-4c03-b15d-261158b451ae", "https://w3id.org/ro-id/c407cb7e-f6f9-4f65-a59d-78e055bb4d02", "https://w3id.org/ro-id/d2b7168d-9c95-406e-8079-0cc1628375cb", "https://w3id.org/ro-id/e93a6358-be21-4e61-bfa3-c23eca08519c"; "https://w3id.org/ro-id/0920cb66-a159-4d81-aae8-4daa0987114d", "https://w3id.org/ro-id/097bd8b0-9a8b-470c-bd27-e2cd8d7a895a", "https://w3id.org/ro-id/1a95b293-a74b-4489-88f1-e9a78678cec7", "https://w3id.org/ro-id/1db93f3a-5732-4fb6-ac28-14ff2b55ccbb", "https://w3id.org/ro-id/5bfb5205-048a-404d-b9fc-b1115ce44391", "https://w3id.org/ro-id/a9edc4b8-9ae2-48e7-80ab-6bd1df7a68e9", "https://w3id.org/ro-id/ceb5652b-ae7b-4c51-9216-d21e3bc3a4fc"; "https://w3id.org/ro-id/81beeb9e-e49d-4bbb-bea1-bf21e168b422"; "Yasmmin Martins. \"Research Object Crate for HPPIDiscovery - Scientific workflow to augment, predict and evaluate host-pathogen protein-protein interactions.\" ROHub. Oct 20 ,2023. https://w3id.org/ro-id/dd5c3d62-b632-46a1-99e4-761f2e6cb60d." . , , , , , , , , , , , , , , , , , , , , , , , , ; "workflow_hpAugmentation"; a , . , , , ; "log"; a , . ; "training_data"; a , . , , , , ; "input_example"; a , . ; ".snakemake"; a , . , , ; "__pycache__"; a , . ; "s_aureus"; a , . ; "3220"^^xsd:integer; "https://api.rohub.org/api/resources/04bdbfbd-b95b-4ec8-9b2b-36bfc698c52b/download/"; ; "2023-10-20 08:40:50.857721+00:00"; "2023-10-20 08:40:52.945411+00:00"; "text/markdown"; ; "CODE_OF_CONDUCT.md"; "2023-10-20 08:40:50.857721+00:00"; a , . ; "1055"^^xsd:integer; "https://api.rohub.org/api/resources/06814c66-d6f5-4d56-8024-1c8d2e58a4db/download/"; ; "2023-10-20 08:40:50.859933+00:00"; "2023-10-20 08:40:53.383379+00:00"; "text/markdown"; ; "LICENSE.md"; "2023-10-20 08:40:50.859933+00:00"; a , . ; "11917056"^^xsd:integer; "https://api.rohub.org/api/resources/074153e1-485d-47c3-8fd1-9b872f4d9bd3/download/"; ; "2023-10-20 08:40:50.969541+00:00"; "2023-10-20 08:40:54.972720+00:00"; "application/x-tar"; ; "s_aureus.tar.xz"; "2023-10-20 08:40:50.969541+00:00"; a , . ; "13959"^^xsd:integer; "https://api.rohub.org/api/resources/07e9fdac-98dc-4a1c-9785-d8d311108cde/download/"; ; "2023-10-20 08:40:50.986923+00:00"; "2023-10-20 08:41:00.215086+00:00"; "application/pdf"; ; "dag.pdf"; "2023-10-20 08:40:50.986923+00:00"; a , . ; "371"^^xsd:integer; "https://api.rohub.org/api/resources/12c2cb86-f0cb-4af4-9573-fe75e143f7e8/download/"; ; "2023-10-20 08:40:50.990475+00:00"; "2023-10-20 08:41:01.207300+00:00"; "text/x-python"; ; "get_metrics.py"; "2023-10-20 08:40:50.990475+00:00"; a , . dct:conformsTo "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/"; ; "10251"; "https://api.rohub.org/api/resources/13c69a83-de3f-4379-b137-6a12d45bf6e7/download/"; ; "2023-10-19 23:56:34+00:00"; "2023-10-20 08:41:12.795284+00:00"; """## Summary HPPIDiscovery is a scientific workflow to augment, predict and perform an insilico curation of host-pathogen Protein-Protein Interactions (PPIs) using graph theory to build new candidate ppis and machine learning to predict and evaluate them by combining multiple PPI detection methods of proteins according to three categories: structural, based on primary aminoacid sequence and functional annotations.
HPPIDiscovery contains three main steps: (i) acquirement of pathogen and host proteins information from seed ppis provided by HPIDB search methods, (ii) Model training and generation of new candidate ppis from HPIDB seed proteins' partners, and (iii) Evaluation of new candidate ppis and results exportation. (i) The first step acquires the identification of the taxonomy ids of the host and pathogen organisms in the result files. Then it proceeds parsing and cleaning the HPIDB results and downloading the protein interactions of the found organisms from the STRING database. The string protein identifiers are also mapped using the id mapping tool of uniprot API and we retrieve the uniprot entry ids along with the functional annotations, sequence, domain and kegg enzymes. (ii) The second step builds the training dataset using the non redundant hpidb validated interactions of each genome as positive set and random string low confidence ppis from each genome as negative set. Then, PredPrin tool is executed in the training mode to obtain the model that will evaluate the new candidate PPIs. The new ppis are then generated by performing a pairwise combination of string partners of host and pathogen hpidb proteins. Finally, (iii) in the third step, the predprin tool is used in the test mode to evaluate the new ppis and generate the reports and list of positively predicted ppis. The figure below illustrates the steps of this workflow. ## Requirements: * Edit the configuration file (config.yaml) according to your own data, filling out the following fields: - base_data: location of the organism folders directory, example: /home/user/data/genomes - parameters_file: Since this workflow may perform parallel processing of multiple organisms at the same time, you must prepate a tabulated file containng the genome folder names located in base data, where the hpidb files are located. Example: /home/user/data/params.tsv. It must have the following columns: genome (folder name), hpidb_seed_network (the result exported by one of the search methods available in hpidb database), hpidb_search_method (the type of search used to generate the results) and target_taxon (the target taxon id). The column hpidb_source may have two values: keyword or homology. In the keyword mode, you provide a taxonomy, protein name, publication id or detection method and you save all results (mitab.zip) in the genome folder. Finally, in the homology mode allows the user to search for host pathogen ppis giving as input fasta sequences of a set of proteins of the target pathgen for enrichment (so you have to select the search for a pathogen set) and you save the zip folder results (interaction data) in the genome folder. This option is extremely useful when you are not sure that your organism has validated protein interactions, then it finds validated interactions from the closest proteins in the database. In case of using the homology mode, the identifiers of the pathogens' query fasta sequences must be a Uniprot ID. All the query protein IDs must belong to the same target organism (taxon id). - model_file: path of a previously trained model in joblib format (if you want to train from the known validated PPIs given as seeds, just put a 'None' value) ## Usage Instructions The steps below consider the creation of a sqlite database file with all he tasks events which can be used after to retrieve the execution time taken by the tasks. It is possible run locally too (see luigi's documentation to change the running command).

* Preparation: 1. ````git clone https://github.com/YasCoMa/hppidiscovery.git```` 2. ````cd hppidiscovery```` 3. ````mkdir luigi_log```` 4. ````luigid --background --logdir luigi_log```` (start luigi server) 5. conda env create -f hp_ppi_augmentation.yml 6. conda activate hp_ppi_augmentation 6.1. (execute ````pip3 install wget```` (it is not installed in the environment)) 7. run ````pwd```` command and get the full path 8. Substitute in config_example.yaml with the full path obtained in the previous step 9. Download SPRINT pre-computed similarities in https://www.csd.uwo.ca/~ilie/SPRINT/precomputed_similarities.zip and unzip it inside workflow_hpAugmentation/predprin/core/sprint/HSP/ 10. ````cd workflow_hpAugmentation/predprin/```` 11. Uncompress annotation_data.zip 12. Uncompress sequence_data.zip 13. ````cd ../../```` 14. ````cd workflow_hpAugmentation```` 15. snake -n (check the plan of jobs, it should return no errors and exceptions) 16. snakemake -j 4 (change this number according the number of genomes to analyse and the amount of cores available in your machine)"""; "reduced_workflow.png"; "Bioinformatics, Protein-Protein interaction prediction, host-pathogen PPIs, proteins network augmentation"; ; "HPPIDiscovery - Scientific workflow to augment, predict and evaluate host-pathogen protein-protein interactions"; "https://workflowhub.eu/projects/200"; "#snakemake"; "2023-10-19 23:56:34+00:00"; "https://about.workflowhub.eu/"; "https://workflowhub.eu/workflows/611?version=1"; "1"; a , , , . ; "101"^^xsd:integer; "https://api.rohub.org/api/resources/15c39d0d-0f43-4d64-8eba-6404356a5adf/download/"; ; "2023-10-20 08:40:51.046715+00:00"; "2023-10-20 08:41:03.530610+00:00"; "text/tab-separated-values"; ; "params_example.tsv"; "2023-10-20 08:40:51.046715+00:00"; a , . "https://ror.org/https://workflowhub.eu/workflows/611?version=1"; ; "21508"^^xsd:integer; "https://api.rohub.org/api/resources/1a309c23-b22b-4384-acd2-b47eace15095/download/"; ; "2023-10-20 08:40:51.464954+00:00"; "2023-10-20 08:41:12.435693+00:00"; "text/html"; ; "ro-crate-preview.html"; "2023-10-20 08:40:51.464954+00:00"; a , , . ; "1363"^^xsd:integer; "https://api.rohub.org/api/resources/21dfa77e-a9f7-4ad4-badb-f8dcc1bcc931/download/"; ; "2023-10-20 08:40:50.975444+00:00"; "2023-10-20 08:40:56.724742+00:00"; ; "2023-10-19T182635.539173.snakemake.log"; "2023-10-20 08:40:50.975444+00:00"; a , . ; "1064201"^^xsd:integer; "https://api.rohub.org/api/resources/24313793-4361-4407-aa07-4df8f0cee2f5/download/"; ; "2023-10-20 08:40:50.872283+00:00"; "2023-10-20 08:40:54.128939+00:00"; "application/zip"; ; "reset_input.zip"; "2023-10-20 08:40:50.872283+00:00"; a , . ; "239"^^xsd:integer; "https://api.rohub.org/api/resources/24f02bf4-0975-45cf-9f5a-ec39be7e52c5/download/"; ; "2023-10-20 08:40:50.983565+00:00"; "2023-10-20 08:40:59.133120+00:00"; "text/x-python"; ; "clean_steps.py"; "2023-10-20 08:40:50.983565+00:00"; a , . ; "83820"^^xsd:integer; "https://api.rohub.org/api/resources/3b6d6431-467c-4cb3-8a07-67a1c561d23c/download/"; ; "2023-10-20 08:40:51.464082+00:00"; "2023-10-20 08:41:06.681827+00:00"; "image/png"; ; "workflow.png"; "2023-10-20 08:40:51.464082+00:00"; a , . ; "20900"^^xsd:integer; "https://api.rohub.org/api/resources/51b0f2f4-234c-4654-99b4-17c3860a5d44/download/"; ; "2023-10-20 08:40:50.988797+00:00"; "2023-10-20 08:41:00.874151+00:00"; "text/x-python"; ; "data_preprocessing_v1.py"; "2023-10-20 08:40:50.988797+00:00"; a , . ; "110"^^xsd:integer; "https://api.rohub.org/api/resources/55ecaa21-84d9-42ec-b9bc-167cf49afa09/download/"; ; "2023-10-20 08:40:50.982729+00:00"; "2023-10-20 08:40:58.947089+00:00"; "text/x-sh"; ; "clean_outputs.sh"; "2023-10-20 08:40:50.982729+00:00"; a , . ; "24"^^xsd:integer; "https://api.rohub.org/api/resources/572775af-ed1f-4c29-8aa5-884686ab257d/download/"; ; "2023-10-20 08:40:50.977115+00:00"; "2023-10-20 08:40:57.078758+00:00"; ; "2023-10-19T183224.995853.snakemake.log"; "2023-10-20 08:40:50.977115+00:00"; a , . ; "2496"^^xsd:integer; "https://api.rohub.org/api/resources/6a7dbfa6-fc66-449f-bbe5-2b4d83cbabb3/download/"; ; "2023-10-20 08:40:51.461457+00:00"; "2023-10-20 08:41:05.738227+00:00"; ; "test_Snakefile"; "2023-10-20 08:40:51.461457+00:00"; a , . ; "10280"^^xsd:integer; "https://api.rohub.org/api/resources/6b5ce189-93d3-4817-8879-57c83efcadea/download/"; ; "2023-10-20 08:40:51.462174+00:00"; "2023-10-20 08:41:06.292490+00:00"; "text/x-python"; ; "training_ppi_augmentation.py"; "2023-10-20 08:40:51.462174+00:00"; a , . ; "5541"^^xsd:integer; "https://api.rohub.org/api/resources/6f6e8ba4-ce00-45e0-9063-30e54af32626/download/"; ; "2023-10-20 08:40:51.047714+00:00"; "2023-10-20 08:41:03.749741+00:00"; "text/markdown"; ; "readme.md"; "2023-10-20 08:40:51.047714+00:00"; a , . ; "488"^^xsd:integer; "https://api.rohub.org/api/resources/71bd00c7-2443-4a7c-b15f-63bab9d9e1aa/download/"; ; "2023-10-20 08:40:50.986013+00:00"; "2023-10-20 08:41:00.028715+00:00"; ; "config_laptop.yaml"; "2023-10-20 08:40:50.986013+00:00"; a , . ; "236"^^xsd:integer; "https://api.rohub.org/api/resources/7776b24b-a086-4195-9f8f-056836c6a0a0/download/"; ; "2023-10-20 08:40:51.045900+00:00"; "2023-10-20 08:41:03.332334+00:00"; "text/tab-separated-values"; ; "params_augmentation.tsv"; "2023-10-20 08:40:51.045900+00:00"; a , . ; "400"^^xsd:integer; "https://api.rohub.org/api/resources/7d719c89-cfba-4eca-8341-ac6daa6b370f/download/"; ; "2023-10-20 08:40:50.856652+00:00"; "2023-10-20 08:40:52.747708+00:00"; ; "CITATION.cff"; "2023-10-20 08:40:50.856652+00:00"; a , . ; "1062372"^^xsd:integer; "https://api.rohub.org/api/resources/7e5af759-e546-4f73-b221-18bdb12c38b0/download/"; ; "2023-10-20 08:40:50.884026+00:00"; "2023-10-20 08:40:54.363194+00:00"; "application/zip"; ; "results.zip"; "2023-10-20 08:40:50.884026+00:00"; a , . ; "8116"^^xsd:integer; "https://api.rohub.org/api/resources/87226596-ccc0-45e0-a273-9bf48505f929/download/"; ; "2023-10-20 08:40:50.991348+00:00"; "2023-10-20 08:41:01.394077+00:00"; ; "hp_ppi_augmentation.yml"; "2023-10-20 08:40:50.991348+00:00"; a , . ; "11043"^^xsd:integer; "https://api.rohub.org/api/resources/87333d63-a279-4521-b93e-ba87ad7af3ba/download/"; ; "2023-10-20 08:40:50.989648+00:00"; "2023-10-20 08:41:01.034064+00:00"; "text/x-python"; ; "evaluation_visualization.py"; "2023-10-20 08:40:50.989648+00:00"; a , . ; "7351"^^xsd:integer; "https://api.rohub.org/api/resources/8b8059d8-8685-4549-b8ba-45c66abbd09d/download/"; ; "2023-10-20 08:40:50.981025+00:00"; "2023-10-20 08:40:58.581977+00:00"; "application/x-python-code"; ; "evaluation_visualization.cpython-38.pyc"; "2023-10-20 08:40:50.981025+00:00"; a , . ; "172"^^xsd:integer; "https://api.rohub.org/api/resources/9456290e-07e1-4286-b1f0-1dd722a7b653/download/"; ; "2023-10-20 08:40:50.984377+00:00"; "2023-10-20 08:40:59.599519+00:00"; ; "config.yaml"; "2023-10-20 08:40:50.984377+00:00"; a , . ; "54971"^^xsd:integer; "https://api.rohub.org/api/resources/9f594136-23cb-4514-a3b0-7ca5193003cf/download/"; ; "2023-10-20 08:40:51.048967+00:00"; "2023-10-20 08:41:03.961153+00:00"; "image/png"; ; "reduced_workflow.png"; "2023-10-20 08:40:51.048967+00:00"; a , . ; "30231"^^xsd:integer; "https://api.rohub.org/api/resources/a056477a-bbc1-45d0-9ba4-3b8c289a1bb6/download/"; ; "2023-10-20 08:40:50.862139+00:00"; "2023-10-20 08:40:53.921361+00:00"; ; "model_trained.joblib"; "2023-10-20 08:40:50.862139+00:00"; a , . ; "24"^^xsd:integer; "https://api.rohub.org/api/resources/a9f6e252-5e53-42a4-8a7f-b4a97b1c40cd/download/"; ; "2023-10-20 08:40:50.976290+00:00"; "2023-10-20 08:40:56.889005+00:00"; ; "2023-10-19T182926.274815.snakemake.log"; "2023-10-20 08:40:50.976290+00:00"; a , . ; "60418600"^^xsd:integer; "https://api.rohub.org/api/resources/abc766e7-ba36-456b-87de-2f17ee5ae845/download/"; ; "2023-10-20 08:40:51.460476+00:00"; "2023-10-20 08:41:04.169671+00:00"; "application/x-tar"; ; "taxdump.tar.gz"; "2023-10-20 08:40:51.460476+00:00"; a , . ; "2238"^^xsd:integer; "https://api.rohub.org/api/resources/aef89e9f-9d50-4fde-8228-740059cb5c16/download/"; ; "2023-10-20 08:40:50.858823+00:00"; "2023-10-20 08:40:53.151499+00:00"; "text/markdown"; ; "CONTRIBUTING.md"; "2023-10-20 08:40:50.858823+00:00"; a , . ; "30231"^^xsd:integer; "https://api.rohub.org/api/resources/b0c9b02f-83cb-410f-8028-cfd13491f0d1/download/"; ; "2023-10-20 08:40:50.971316+00:00"; "2023-10-20 08:40:55.814044+00:00"; ; "model_trained.joblib"; "2023-10-20 08:40:50.971316+00:00"; a , . ; "7912"^^xsd:integer; "https://api.rohub.org/api/resources/b1c57e68-6255-41b8-b621-555a2ae094ae/download/"; ; "2023-10-20 08:40:50.981927+00:00"; "2023-10-20 08:40:58.762999+00:00"; "application/x-python-code"; ; "training_ppi_augmentation.cpython-38.pyc"; "2023-10-20 08:40:50.981927+00:00"; a , . ; "5834"^^xsd:integer; "https://api.rohub.org/api/resources/b2624fab-49ca-4a8d-aaa5-496ea8a622b2/download/"; ; "2023-10-20 08:40:50.972235+00:00"; "2023-10-20 08:40:56.050012+00:00"; "text/markdown"; ; "readme.md"; "2023-10-20 08:40:50.972235+00:00"; a , . ; "0"^^xsd:integer; "https://api.rohub.org/api/resources/bf79d072-d5db-4eb9-b7fe-602b9bb7243c/download/"; ; "2023-10-20 08:40:51.462840+00:00"; "2023-10-20 08:41:06.496060+00:00"; ; "workflow.cwl"; "2023-10-20 08:40:51.462840+00:00"; a , . ; "13756"^^xsd:integer; "https://api.rohub.org/api/resources/c0ff9b95-c133-46a2-8b43-7e9adda5a910/download/"; ; "2023-10-20 08:40:50.980132+00:00"; "2023-10-20 08:40:58.274102+00:00"; "application/x-python-code"; ; "data_preprocessing.cpython-38.pyc"; "2023-10-20 08:40:50.980132+00:00"; a , . ; "121"^^xsd:integer; "https://api.rohub.org/api/resources/c16364f9-cdaf-4f69-8b2d-24ec86e0760a/download/"; ; "2023-10-20 08:40:50.985206+00:00"; "2023-10-20 08:40:59.847114+00:00"; ; "config_example.yaml"; "2023-10-20 08:40:50.985206+00:00"; a , . ; "24"^^xsd:integer; "https://api.rohub.org/api/resources/c16775dc-e7c3-486c-990c-3be2c5ea7a1a/download/"; ; "2023-10-20 08:40:50.977925+00:00"; "2023-10-20 08:40:57.564724+00:00"; ; "2023-10-19T201611.869180.snakemake.log"; "2023-10-20 08:40:50.977925+00:00"; a , . ; "3311534"^^xsd:integer; "https://api.rohub.org/api/resources/c2131dd6-62cc-4d44-8083-704edbc6a3ce/download/"; ; "2023-10-20 08:40:51.044984+00:00"; "2023-10-20 08:41:02.435885+00:00"; "text/tab-separated-values"; ; "mapping_geneName_uniprot.tsv"; "2023-10-20 08:40:51.044984+00:00"; a , . ; "6489975"^^xsd:integer; "https://api.rohub.org/api/resources/ebc55bb6-993d-4f70-b995-17a4664bb0e9/download/"; ; "2023-10-20 08:40:51.016019+00:00"; "2023-10-20 08:41:01.639459+00:00"; "text/tab-separated-values"; ; "list_virulence_factors_full.tsv"; "2023-10-20 08:40:51.016019+00:00"; a , . ; "162"^^xsd:integer; "https://api.rohub.org/api/resources/ed49a599-f8a8-4c12-a05d-3712467babdf/download/"; ; "2023-10-20 08:40:50.855228+00:00"; "2023-10-20 08:40:52.443929+00:00"; ; ".gitignore"; "2023-10-20 08:40:50.855228+00:00"; a , . ; "54971"; "https://api.rohub.org/api/resources/ee6fb1ee-e7de-42d7-82e0-ed5e33bb3c36/download/"; ; "2023-10-20 08:40:50.973456+00:00"; "2023-10-20 08:40:56.508828+00:00"; "image/png"; ; "reduced_workflow.png"; "2023-10-20 08:40:50.973456+00:00"; a , , , . ; "22743"^^xsd:integer; "https://api.rohub.org/api/resources/f02fd4e5-6869-4afc-a651-5e912e99b841/download/"; ; "2023-10-20 08:40:50.987866+00:00"; "2023-10-20 08:41:00.412406+00:00"; "text/x-python"; ; "data_preprocessing.py"; "2023-10-20 08:40:50.987866+00:00"; a , . dct:conformsTo ; ; a . "petition"; a ; "5.155195681511471"; "19.1" . "PR"; a ; "1.989819527996298"; "4.3" . "pathogen hpidb protein"; a ; "2.049716528565198"; "4.699999999999999" . "Diseases and conditions"; a ; "Health/Diseases and conditions" . "Health"; a ; "Health" . "genome"; a ; "1.9163292847503375"; "7.1" . "protein"; a ; "8.97732531235539"; "19.4" . "mathematical and computer sciences"; a ; "16.901985704152953"; "0.3888532221317291" . "home"; a ; "2.42914979757085"; "9.0" . "computer hardware"; a ; "1.3906447534766122"; "1.1" . "organism"; a ; "5.043961129106895"; "10.9" . "target"; a ; "3.840814437760297"; "8.3" . "network"; a ; "8.005552984729292"; "17.3" . "yPublish - Bioinfo tools"; a , . } sub:provenance { sub:assertion prov:wasDerivedFrom . } sub:pubinfo { this: dct:created "2025-11-11T16:05:52.977+01:00"^^xsd:dateTime; npx:introduces ; a npx:RoCrateNanopub; rdfs:label "Research Object Crate for HPPIDiscovery - Scientific workflow to augment, predict and evaluate host-pathogen protein-protein interactions" . sub:sig npx:hasAlgorithm "RSA"; npx:hasPublicKey "MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA4pPaESKwmC6l37P86K6TNLq6yeQtc7m9CvcqauLs/1FC0viHvQnFBgxj0a+loPDv/Egwe6OqFpa0iW9Ypnyz9YPoh+pxbRXonbuMOb+8Ry9hXZ+TEKfWjhjVDGEaClwfRwglh2HI/xfV4CD9AgvDOEoZQiyta8a90PYwJ3G6e70oCHTn61+OWTkI9KRYHOYgg3btdy2Z7q/30PTFawb2ZT5aIfIJYobUYv2a7yhtcqWCHZeKv0bxGnRjTFNx1rscBMlLJSzvRtpQc1cCRVEPFZHo1adaXCI9tGvn4cxeNQ96y8dxkN1XhpaJairde+23MDzf42Oe97KG2HYzKiyVnQIDAQAB"; npx:hasSignature "cinWOjDPWIlldP+vr91FygyydE+UAGrkg6iBWmsfWLh8162t5ioZk6ypxRooTDMfkqDiZdnXipkHV/md7KyZBMgeQ1p3g2bpqQI0d+YOdMs+qFH8MK4H1Up6g0ex/QRaRbjBDgFzLKYjmoR4SNKiqCHVDMq6ychrllqBrrRU9JzJqVYATfv9eP2knwMuUt4wSyHAQSjhijM3+hKYRai4BaPtt7TSuyK4qLXDIZ0SqtGw69HuaxoP7jpJhBruUSC5NEnzyLjuslqO6Ui2lGd9vlK34flGhDAiuRsA0SUoOAIcTIoxSKcVt4fXjpS9FqF2kvaRoYi32G+iQXTUZkyiwg=="; npx:hasSignatureTarget this:; npx:signedBy . }