(C++) HtmlPage
January 25, 2018 · View on GitHub
(C++) HtmlPage



HtmlPage is a C++ class for processing an HTML page: it stores the filename of an HTML file and extracts the page's <title> from it.
Technical facts
./CppHtmlPage/CppHtmlPage.pri
# qmake include file that adds the HtmlPage class to a project.

# Directory searched for htmlpage.h
INCLUDEPATH += \
    ../../Classes/CppHtmlPage

# Implementation file of the class
SOURCES += \
    ../../Classes/CppHtmlPage/htmlpage.cpp

# Header file of the class
HEADERS += \
    ../../Classes/CppHtmlPage/htmlpage.h

# Non-compiled files listed with the project
OTHER_FILES += \
    ../../Classes/CppHtmlPage/Licence.txt
./CppHtmlPage/htmlpage.h
//--------------------------------------------------------------------------- /* HtmlPage, HTML page class Copyright 2011-2015 Richel Bilderbeek This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ //--------------------------------------------------------------------------- //From http://www.richelbilderbeek.nl/CppHtmlPage.htm //--------------------------------------------------------------------------- #ifndef HTMLPAGE_H #define HTMLPAGE_H #include <string> #include <vector> #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Weffc++" #include <boost/checked_delete.hpp> #pragma GCC diagnostic pop namespace ribi { struct HtmlPage { explicit HtmlPage(const std::string& filename); HtmlPage(const HtmlPage&) = delete; HtmlPage& operator=(const HtmlPage&) = delete; ///Obtain the filename of the HTML page const std::string& GetFilename() const noexcept { return m_filename; } ///Obtain the title of the HTML page const std::string& GetTitle() const noexcept { return m_title; } ///Obtain the version of this class static std::string GetVersion() noexcept; ///Obtain the version history of this class static std::vector<std::string> GetVersionHistory() noexcept; ///Replace all occurrences of a string within a string ///From http://www.richelbilderbeek.nl/CppReplaceAll.htm static std::string ReplaceAll( std::string s, const std::string& replaceWhat, const std::string& replaceWithWhat) noexcept; private: ~HtmlPage() 
noexcept {} friend void boost::checked_delete<>(HtmlPage* x); friend void boost::checked_delete<>(const HtmlPage* x); ///The filename of the HTML page const std::string m_filename; ///The title of the HTML page const std::string m_title; ///Find the <title> in an HTML document static std::string FindTitle(const std::string& filename) noexcept; #ifndef NDEBUG static void Test() noexcept; #endif }; bool operator<(const HtmlPage& lhs, const HtmlPage& rhs) noexcept; } //~namespace ribi #endif // HTMLPAGE_H
./CppHtmlPage/htmlpage.cpp
//--------------------------------------------------------------------------- /* HtmlPage, HTML page class Copyright 2011-2015 Richel Bilderbeek This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ //--------------------------------------------------------------------------- //From http://www.richelbilderbeek.nl/CppHtmlPage.htm //--------------------------------------------------------------------------- #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Weffc++" #pragma GCC diagnostic ignored "-Wunused-local-typedefs" #include "htmlpage.h" #include <fstream> #include <iostream> #include <boost/algorithm/string.hpp> #include <boost/xpressive/xpressive.hpp> #include <QDir> #include <QFile> #include "fileio.h" #pragma GCC diagnostic pop ribi::HtmlPage::HtmlPage(const std::string& filename) : m_filename{filename}, m_title{FindTitle(filename)} { #ifndef NDEBUG Test(); #endif assert(ribi::fileio::FileIo().IsRegularFile(filename)); } std::string ribi::HtmlPage::FindTitle(const std::string& filename) noexcept { const boost::xpressive::sregex title_regex = boost::xpressive::sregex::compile("<title>.*</title>"); //Copy all filenames matching the regex in the resulting std::vector const std::vector<std::string> v = ribi::fileio::FileIo().FileToVector(filename); for (const std::string s: v) { if (boost::xpressive::regex_search(s,title_regex)) { std::string t = s; //Trim leading whitespace while (!std::isgraph(t[0])) t 
= t.substr(1,t.size() - 1); //Trim trailing whitespace while (!std::isgraph(t[t.size()-1])) t.resize(t.size() - 1); //Extract title assert(t.substr(0,7)=="<title>"); assert(t.substr(t.size()-8,8)=="</title>"); const std::string title = t.substr(7,t.size()-8-7); const std::string title_clean = ReplaceAll(title,"&","&"); return title_clean; } } return {}; } std::string ribi::HtmlPage::GetVersion() noexcept { return "1.2"; } std::vector<std::string> ribi::HtmlPage::GetVersionHistory() noexcept { return { "2011-xx-xx: version 1.0: initial version", "2012-08-12: version 1.1: started versioning this class", "2013-09-02: version 1.2: replaced Boost.Regex by Boost.Xpressive" }; } std::string ribi::HtmlPage::ReplaceAll( std::string s, const std::string& replaceWhat, const std::string& replaceWithWhat) noexcept { while(1) { const int pos = s.find(replaceWhat); if (pos==-1) break; s.replace(pos,replaceWhat.size(),replaceWithWhat); } return s; } #ifndef NDEBUG void ribi::HtmlPage::Test() noexcept { { static bool is_tested{false}; if (is_tested) return; is_tested = true; } //Test finding { const std::string filename { "tmp.txt" }; std::ofstream file(filename); file << "Nothing"; file.close(); assert(FindTitle(filename)==""); std::remove(filename.c_str()); } { const std::string filename { "tmp.txt" }; std::ofstream file(filename); file << "Nothing\n"; file << "<title>Something</title>\n"; file << "Nothing\n"; file.close(); assert(FindTitle(filename)=="Something"); std::remove(filename.c_str()); } } #endif bool ribi::operator<(const HtmlPage& lhs, const HtmlPage& rhs) noexcept { //Case insensitive compare return boost::algorithm::to_lower_copy(lhs.GetTitle()) < boost::algorithm::to_lower_copy(rhs.GetTitle()); }